In [120]:
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [121]:
class RegressionChallenge():
    """Tune, persist and evaluate Ridge and k-NN regressors on a train/test CSV pair.

    Parameters
    ----------
    train_path : str
        CSV file holding the training rows (features plus response).
    test_path : str
        CSV file holding the rows to predict.
    output_dir : str, optional
        Directory that receives the pickled best estimators. Defaults to
        ``'../out'`` and is created on first save when it does not exist.
    """

    def __init__(self,
                 train_path,
                 test_path,
                 output_dir='../out'):
        self.train_file = train_path
        self.test_file = test_path
        self.output_dir = output_dir
        # Nothing in this class uses tqdm's pandas integration; the call is
        # kept so code that relies on it after constructing the object still works.
        tqdm.pandas()

    def _save_model(self, model, filename):
        """Pickle ``model`` as ``filename`` inside ``self.output_dir``.

        The directory is created when missing so that a finished grid search
        is not thrown away by a ``FileNotFoundError`` at save time.

        Returns
        -------
        str
            Path of the file that was written.
        """
        # An empty output_dir means "current directory"; os.makedirs('') would raise.
        if self.output_dir:
            os.makedirs(self.output_dir, exist_ok=True)
        model_path = os.path.join(self.output_dir, filename)
        with open(model_path, 'wb') as file:
            pickle.dump(model, file)
        return model_path

    def prepare_data(self):
        """Load both CSV files and return standardised features plus the response.

        The first column of each file is skipped (presumably a row identifier;
        confirm against the data) and the last column of the training file is
        taken as the response.

        Returns
        -------
        tuple
            ``(X_train, y_train, X_test)``. Both feature matrices are the
            output of ``StandardScaler``; ``y_train`` is the response column.
        """
        train_data = pd.read_csv(self.train_file)
        test_data = pd.read_csv(self.test_file)

        # Separate features and response variable in training data
        X_train = train_data.iloc[:, 1:-1]
        X_test = test_data.iloc[:, 1:]
        y_train = train_data.iloc[:, -1]

        # Print column / dtype / null-count summaries as a quick sanity check.
        X_train.info()
        X_test.info()

        # The scaler is fitted on the training features only and then reused
        # for the test features, so test statistics never influence the scaling.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        return X_train, y_train, X_test

    def train_linear_regression(self,
                                X_train,
                                y_train):
        """Grid-search the Ridge ``alpha`` with 5-fold CV and pickle the winner.

        Returns
        -------
        tuple
            ``(best_estimator, best_params)`` taken from the fitted
            ``GridSearchCV``. The estimator is also written to
            ``linear_regression_model.pkl`` in the output directory.
        """
        ridge = Ridge()
        param_grid = {'alpha': [0.01, 0.1, 1, 10]}  # Example: alpha values to tune
        grid_search = GridSearchCV(ridge, param_grid, cv=5)
        grid_search.fit(X_train, y_train)
        best_params = grid_search.best_params_
        best_estimator = grid_search.best_estimator_

        # Save the best estimator using pickle
        self._save_model(best_estimator, 'linear_regression_model.pkl')

        return best_estimator, best_params

    def predict_linear_regression(self,
                                  lm,
                                  X_test):
        """Return ``lm.predict(X_test)`` for a fitted linear model."""
        y_pred_lm = lm.predict(X_test)
        return y_pred_lm

    def train_knn(self,
                  X_train,
                  y_train):
        """Grid-search a ``KNeighborsRegressor`` with 5-fold CV and pickle the winner.

        Returns
        -------
        tuple
            ``(best_estimator, best_params, cv_results)`` taken from the fitted
            ``GridSearchCV``. The estimator is also written to
            ``knn_model.pkl`` in the output directory.
        """
        knn = KNeighborsRegressor()
        # NOTE(review): 'algorithm' and 'leaf_size' look like they only change
        # how neighbours are searched, not which ones are found, so they
        # probably multiply the search cost without changing the score.
        # Confirm before trimming, because the keys appear in best_params.
        param_grid = {
            'n_neighbors': [1, 2, 3, 5, 7],  # Example: k values to tune
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'leaf_size': [10, 30, 50],
            'p': [1, 2, 3]  # Example: different values of p
        }
        grid_search = GridSearchCV(knn, param_grid, cv=5)
        grid_search.fit(X_train, y_train)
        best_params = grid_search.best_params_
        best_estimator = grid_search.best_estimator_
        best_results = grid_search.cv_results_

        # Save the best estimator using pickle
        self._save_model(best_estimator, 'knn_model.pkl')

        return best_estimator, best_params, best_results

    def predict_knn(self,
                    knn,
                    X_test):
        """Return ``knn.predict(X_test)`` for a fitted k-NN model."""
        y_pred_knn = knn.predict(X_test)
        return y_pred_knn

    def evaluate_model(self,
                       y_true,
                       y_pred,
                       model_name):
        """Score predictions and return the metrics as a one-row DataFrame.

        Returns
        -------
        pandas.DataFrame
            A single row with the columns ``Model``, ``MSE``, ``MAE`` and ``R2``.
        """
        mse = mean_squared_error(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)
        evaluation = pd.DataFrame({'Model': [model_name], 'MSE': [mse], 'MAE': [mae], 'R2': [r2]})
        return evaluation

    def show_selected_parameters(self, model_name, best_params):
        """Print a header line followed by one ``name: value`` line per parameter."""
        print(f"Selected parameters for {model_name}:")
        for param, value in best_params.items():
            print(f"{param}: {value}")
  

In [122]:
# Build the challenge runner from the train / test CSV locations.
solver = RegressionChallenge(
    train_path='../data/train_ch.csv',
    test_path='../data/test_ch.csv',
)

# Load both files and standardise the features.
X_train, y_train, X_test = solver.prepare_data()

# Ridge: tune the model, then gather 5-fold out-of-fold predictions on the
# training rows so it can be scored without touching the test set.
lm, lm_params = solver.train_linear_regression(X_train, y_train)
y_pred_lm = cross_val_predict(lm, X_train, y_train, cv=5)

# k-NN: same procedure with its own hyper-parameter grid.
knn, knn_params, knn_results = solver.train_knn(X_train, y_train)
y_pred_knn = cross_val_predict(knn, X_train, y_train, cv=5)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   v1      1000 non-null   float64
 1   v2      1000 non-null   float64
 2   v3      1000 non-null   float64
 3   v4      1000 non-null   float64
 4   v5      1000 non-null   float64
 5   v6      1000 non-null   float64
 6   v7      1000 non-null   float64
 7   v8      1000 non-null   float64
 8   v9      1000 non-null   float64
dtypes: float64(9)
memory usage: 70.4 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   v1      100 non-null    float64
 1   v2      100 non-null    float64
 2   v3      100 non-null    float64
 3   v4      100 non-null    float64
 4   v5      100 non-null    float64
 5   v6      100 non-null    float64
 6   v7      100 non-null    float64
 7   v8    

In [123]:
# Score each model's out-of-fold predictions against the training response.
# The one-row summaries are collected in `evaluation` (previously left empty)
# and printed as a single table so the models can be compared row by row.
evaluation = [
    solver.evaluate_model(y_train, y_pred_lm, 'Linear Regression'),
    solver.evaluate_model(y_train, y_pred_knn, 'KNN Regression'),
]
print(pd.concat(evaluation, ignore_index=True))


               Model         MSE        MAE        R2
0  Linear Regression  222.305002  12.783789  0.996002
            Model          MSE       MAE        R2
0  KNN Regression  5328.330083  58.58787  0.904177


In [124]:
# Predict the test rows with both tuned models and write the two prediction
# columns side by side to a CSV file.
y_test_predict_lm = solver.predict_linear_regression(lm, X_test)
y_test_predict_knn = solver.predict_knn(knn, X_test)
predictions_df = pd.DataFrame(
    {
        'pred_lm': y_test_predict_lm,
        'pred_knn': y_test_predict_knn,
    }
)
predictions_df.to_csv('../out/predictions.csv', index=False)

In [125]:
# Report the hyper-parameters chosen by each grid search, k-NN first.
for model_name, chosen_params in (
    ('KNN Regression', knn_params),
    ('Linear Regression', lm_params),
):
    solver.show_selected_parameters(model_name, chosen_params)

Selected parameters for KNN Regression:
algorithm: auto
leaf_size: 10
n_neighbors: 7
p: 3
weights: distance
Selected parameters for Linear Regression:
alpha: 1
