## Data Analysis Notebook
This notebook demonstrates the process of loading data, preprocessing it, and preparing it for modeling.

In [15]:
# Import pandas library to handle data in DataFrame structures
import pandas as pd

### Data Loading
Here we load the necessary data from an Excel file. It is important to adjust the path to where your file is located.

In [16]:
# Load data from an Excel file into a pandas DataFrame
data = pd.read_excel('/content/EVDS.xlsx')
data

Unnamed: 0,Tarih,TP N2SY01 2005
0,2005-01,83.73
1,2005-02,85.98
2,2005-03,98.50
3,2005-04,95.00
4,2005-05,99.10
...,...,...
91,2012-08,117.23
92,2012-09,135.54
93,2012-10,130.76
94,2012-11,141.02


In [17]:
# Create lag features by shifting the data column by 1, 2, and 3 periods
lag_data1=data['TP N2SY01 2005'].shift(1)
lag_data2=data['TP N2SY01 2005'].shift(2)
lag_data3=data['TP N2SY01 2005'].shift(3)

### Feature Engineering
Creating lag features to use in time series forecasting or regression analysis.

In [18]:
data_with_lag = pd.concat([data, lag_data1, lag_data2, lag_data3], axis=1)
data_with_lag.columns = list(data.columns) + ['lag1', 'lag2', 'lag3']
data_with_lag = data_with_lag.dropna()
y=data_with_lag.iloc[:,1]
X=data_with_lag.iloc[:,2:]

### Data Splitting
Splitting the data into training and test sets to evaluate the performance of our model objectively.

In [19]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y, random_state=0,
                                                   test_size=0.2)
X_train=X[0:72]
y_train=y[0:72]
X_test=X[73:93]
y_test=y[73:93]

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error


In [21]:
svr=SVR()
param_range = [0.1, 1.0, 10.0]
param_grid = {  'C': param_range}
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'kernel': ['linear', 'rbf'],
              'gamma': ['scale', 'auto']}
gs_SVR = GridSearchCV(estimator=svr, param_grid=param_grid,
                  scoring='neg_mean_squared_error',cv=2,refit=True, n_jobs=-1)
gs_SVR = gs_SVR.fit(X_train, y_train)
print(gs_SVR.best_score_)
print(gs_SVR.best_params_)

-70.15650995162387
{'C': 0.01, 'gamma': 'scale', 'kernel': 'linear'}


In [22]:
best_svr=gs_SVR.best_estimator_
y_pred_svr = best_svr.predict(X_test)
mse_svr = mean_squared_error(y_test, y_pred_svr)
print("Mean Squared Error: ", mse_svr)


Mean Squared Error:  105.64817882012349


In [23]:
from sklearn.tree import DecisionTreeRegressor
decison_tree = DecisionTreeRegressor()
param_grid = {'max_depth': [None, 5, 10, 15],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4],
              'max_features': ['auto', 'sqrt', 'log2']}

In [24]:
gs_tree = GridSearchCV(decison_tree, param_grid, cv=5, scoring='neg_mean_squared_error')
gs_tree.fit(X_train, y_train)
best_tree = gs_tree.best_estimator_
y_pred_tree = best_tree.predict(X_test)
mse_tree = mean_squared_error(y_test, y_pred_tree)
print("Mean Squared Error: ", mse_tree)
print(best_tree)



Mean Squared Error:  192.1702838520412
DecisionTreeRegressor(max_depth=5, max_features='sqrt', min_samples_leaf=2)


In [25]:
from sklearn.ensemble import RandomForestRegressor
random_forest = RandomForestRegressor()
param_grid = {'n_estimators': [10, 15, 20],
              'max_depth': [None, 5, 10, 15],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4],
              'max_features': ['auto', 'sqrt', 'log2']}

In [26]:
gs_forest = GridSearchCV(random_forest, param_grid, cv=5, scoring='neg_mean_squared_error')
gs_forest.fit(X_train, y_train)
best_forest = gs_forest.best_estimator_
y_pred_forest = best_forest.predict(X_test)
mse_forest = mean_squared_error(y_test, y_pred_forest)
print(best_forest)
print(mse_forest)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


RandomForestRegressor(max_depth=10, max_features='log2', min_samples_split=5,
                      n_estimators=15)
124.32077213100528


In [33]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
param_grid_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance']}
gs_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring='neg_mean_squared_error')
gs_knn.fit(X_train, y_train)
best_knn = gs_knn.best_estimator_
print("Best Parameters: ", gs_knn.best_params_)
y_pred_knn = best_knn.predict(X_test)

Best Parameters:  {'n_neighbors': 7, 'weights': 'distance'}


In [34]:
mse_knn = mean_squared_error(y_test, y_pred_knn)
print(best_knn)
print("Mean Squared Error: ", mse_knn)

KNeighborsRegressor(n_neighbors=7, weights='distance')
Mean Squared Error:  195.3776226877704
