In [36]:
import numpy as np
import pandas as pd

train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
submission=pd.read_csv('sample_submission.csv')

from sklearn.model_selection import train_test_split

def year(date_time):
    return int(date_time[:4])

def month(date_time):
    return int(date_time[5:7])

#적용
train['year'] = train['date_time'].apply(year)
test['year'] = test['date_time'].apply(year)

train['month'] = train['date_time'].apply(month)
test['month'] = test['date_time'].apply(month)

#date_time 제거
train=train.drop(['date_time'], axis=1)
test=test.drop(['date_time'], axis=1)

X=train.drop(columns=['number_of_rentals', 'wind_direction', 'low_temp', 'humidity'])
y=train['number_of_rentals']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# X용 스케일러
sc_X = StandardScaler()
X_train_scaled = sc_X.fit_transform(X_train)
X_valid_scaled = sc_X.transform(X_valid)
X_test_scaled = sc_X.transform(test.drop(columns=['wind_direction', 'low_temp', 'humidity']))

# y용 스케일러
sc_y = StandardScaler()
y_train_scaled = sc_y.fit_transform(y_train.values.reshape(-1, 1)).ravel()
y_valid_scaled = sc_y.transform(y_valid.values.reshape(-1, 1)).ravel()


from sklearn.svm import SVR
model=SVR(kernel='rbf', C=50.0, epsilon=0.5, gamma=0.1)
model.fit(X_train_scaled, y_train_scaled)

y_train_pred = model.predict(X_train_scaled)
y_valid_pred = model.predict(X_valid_scaled)
y_test_pred = model.predict(X_test_scaled)

y_train_pred_original = sc_y.inverse_transform(y_train_pred.reshape(-1, 1)).ravel()
y_valid_pred_original = sc_y.inverse_transform(y_valid_pred.reshape(-1, 1)).ravel()
y_test_pred_original = sc_y.inverse_transform(y_test_pred.reshape(-1, 1)).ravel()

def nmae(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true))

train_nmae = nmae(y_train, y_train_pred_original)
valid_nmae = nmae(y_valid, y_valid_pred_original)

print(f"Train NMAE: {train_nmae:}")
print(f"Validation NMAE: {valid_nmae:}")

submission['number_of_rentals']=y_test_pred_original
submission.to_csv('SVR_ver3_submission.csv', index=False)


Train NMAE: 0.21759201077956303
Validation NMAE: 0.6335972019755821


In [21]:
import os
import pandas as pd

folder_path = r"c:\Users\김은교\OneDrive\바탕 화면\BRAIN\Dacon-project\bicycle_rentals_prediction"
file_list = os.listdir(folder_path)

for filename in file_list:
    file_path = os.path.join(folder_path, filename)
    try:
        if filename.endswith(".txt"):
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
                print(content[:200])  # 앞부분만 출력

        elif filename.endswith(".csv"):
            df = pd.read_csv(file_path, encoding="utf-8")
            print(df.head())

        else:
            print(f"Skipping unsupported file type: {filename}")

    except PermissionError:
        print(f"Permission denied: {file_path}")
    except IsADirectoryError:
        print(f"Skipping directory: {file_path}")
    except UnicodeDecodeError:
        print(f"Encoding error: {file_path}")


Skipping unsupported file type: .ipynb_checkpoints
Skipping unsupported file type: 1차
Skipping unsupported file type: 2조_멘티.pdf
Skipping unsupported file type: 2차 발표 (1).pdf
Skipping unsupported file type: 2차 발표 초안.pptx
Skipping unsupported file type: 2차 발표.pdf
Skipping unsupported file type: 2차 발표.pptx
Skipping unsupported file type: 2차 발표_최종.pdf
    date_time  number_of_rentals
0  2021-04-01       40794.474573
1  2021-04-02       35584.415451
2  2021-04-03       33357.655951
3  2021-04-04       31480.311211
4  2021-04-05       64507.239850
Skipping unsupported file type: Graident Boost model.ipynb
Skipping unsupported file type: lasso.ipynb
Skipping unsupported file type: Linear regression, SVR.pptx
Skipping unsupported file type: linear regression.ipynb
Skipping unsupported file type: minseo.ipynb
Skipping unsupported file type: model selection(1).ipynb
Skipping unsupported file type: ridge.ipynb
    date_time  number_of_rentals
0  2021-04-01                  0
1  2021-04-02        

In [25]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.5, 0.1, 1, 5, 10, 50],
    'epsilon': [0.1, 0.2, 0.3, 0.4, 0.5],
    'gamma': [0.01, 0.05, 0.1, 0.5, 'scale']
}

from sklearn.metrics import make_scorer

def NMAE(true, pred):
    return np.mean(np.abs(true - pred) / true)
NMAE_scorer = make_scorer(NMAE, greater_is_better=False) # NMAE가 작을수록 좋음

grid = GridSearchCV(SVR(kernel='rbf'), param_grid, scoring=NMAE_scorer, cv=5, n_jobs=1)
grid.fit(X_train_scaled, y_train_scaled)

print("Best params:", grid.best_params_)


Best params: {'C': 50, 'epsilon': 0.1, 'gamma': 0.1}


In [23]:

grid = GridSearchCV(SVR(kernel='rbf'), param_grid, scoring=NMAE_scorer, cv=5, n_jobs=1)
grid.fit(X_train_scaled, y_train_scaled)

print("Best params:", grid.best_params_)


Best params: {'C': 50, 'epsilon': 0.1, 'gamma': 0.1}


In [24]:
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.svm import SVR

best_score = -np.inf
best_components = None

for n_components in range(1, X.shape[1] + 1):  # 특성 개수까지 시도
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_train_scaled)
    model = SVR(kernel='rbf', C=100.0, epsilon=0.2, gamma=0.01)
    model.fit(X_pca, y_train_scaled)
    # 교차 검증 점수 계산
    scores = cross_val_score(model, X_pca, y_train_scaled, cv=5)  # 5-폴드 교차 검증
    mean_score = np.mean(scores)

    if mean_score > best_score:
        best_score = mean_score
        best_components = n_components

print(f"최적의 주성분 개수: {best_components}")
print(f"최고 점수: {best_score}")

최적의 주성분 개수: 7
최고 점수: 0.8538611770146899
