# 1) Importing Libraries

In [1]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,LogisticRegression
from sklearn.metrics import mean_absolute_error ,mean_squared_error, median_absolute_error,confusion_matrix,accuracy_score,r2_score
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectPercentile
from sklearn.preprocessing import StandardScaler ,PolynomialFeatures,minmax_scale,MaxAbsScaler ,LabelEncoder
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

# 2) Loading the datasets

In [2]:
train_df=pd.read_csv(r"C:\Users\Balasubramanian\Desktop\work\ML-Assignment\new_train.csv")
test_df=pd.read_csv(r"C:\Users\Balasubramanian\Desktop\work\ML-Assignment\new_test.csv")

In [3]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,user_id,category_id,video_id,gender,profession,followers,views,age_boxcox,es_boxcox
0,0,19990,37,128,Male,Student,180,1000,3.607893,22.944055
1,1,5304,32,132,Female,Student,330,714,2.980037,4.498761
2,2,1840,12,24,Male,Student,180,138,3.33088,23.155705
3,3,12597,23,112,Male,Student,220,613,3.33088,17.464412
4,4,13626,23,112,Male,Working Professional,220,613,3.750236,12.22271


# 3) Feature engneering

In [4]:
# Training dataset
# Convert categories to numbers so that algorithm can understande in better way ( encoding)

train_df["Gender"]=pd.get_dummies(train_df["gender"],drop_first=True,dtype=int)
prof_dummy = pd.get_dummies(train_df["profession"], drop_first=True, dtype=int)
train_df = pd.concat([train_df, prof_dummy], axis=1)

train_df.head() 

Unnamed: 0.1,Unnamed: 0,user_id,category_id,video_id,gender,profession,followers,views,age_boxcox,es_boxcox,Gender,Student,Working Professional
0,0,19990,37,128,Male,Student,180,1000,3.607893,22.944055,1,1,0
1,1,5304,32,132,Female,Student,330,714,2.980037,4.498761,0,1,0
2,2,1840,12,24,Male,Student,180,138,3.33088,23.155705,1,1,0
3,3,12597,23,112,Male,Student,220,613,3.33088,17.464412,1,1,0
4,4,13626,23,112,Male,Working Professional,220,613,3.750236,12.22271,1,0,1


- We apply encoding to categorical features on training dataset

In [5]:
# Testing dataset
# Convert categories to numbers so that algorithm can understande in better way (Ordinal encoding)
test_df["Gender"]=pd.get_dummies(test_df["gender"],drop_first=True,dtype=int)
prof_dummy = pd.get_dummies(test_df["profession"], drop_first=True, dtype=int)
test_df = pd.concat([test_df, prof_dummy], axis=1)
test_df.head() 

Unnamed: 0.1,Unnamed: 0,user_id,category_id,video_id,gender,profession,followers,views,age_boxcox,Gender,Student,Working Professional
0,0,7986,12,42,Male,Student,180,138,2.92632,1,1,0
1,1,11278,34,115,Male,Student,230,840,2.92632,1,1,0
2,2,17245,8,110,Female,Working Professional,280,628,4.247256,0,0,1
3,3,9851,16,137,Male,Student,270,462,3.203661,1,1,0
4,4,16008,34,96,Female,Other,230,840,4.327436,0,0,0


- We apply encoding to categorical features on testing dataset

# 4) Feature selection

In [6]:
# Training dataset
# droping the columns which are not required

train_df.drop(columns=["Unnamed: 0","gender","profession"],inplace=True)
train_df.head()

Unnamed: 0,user_id,category_id,video_id,followers,views,age_boxcox,es_boxcox,Gender,Student,Working Professional
0,19990,37,128,180,1000,3.607893,22.944055,1,1,0
1,5304,32,132,330,714,2.980037,4.498761,0,1,0
2,1840,12,24,180,138,3.33088,23.155705,1,1,0
3,12597,23,112,220,613,3.33088,17.464412,1,1,0
4,13626,23,112,220,613,3.750236,12.22271,1,0,1


In [7]:
# Testinging dataset
# droping the columns which are not required

test_df.drop(columns=["Unnamed: 0","gender","profession"],inplace=True)
test_df.head()

Unnamed: 0,user_id,category_id,video_id,followers,views,age_boxcox,Gender,Student,Working Professional
0,7986,12,42,180,138,2.92632,1,1,0
1,11278,34,115,230,840,2.92632,1,1,0
2,17245,8,110,280,628,4.247256,0,0,1
3,9851,16,137,270,462,3.203661,1,1,0
4,16008,34,96,230,840,4.327436,0,0,0


- We are done with Analysis of whole dataset 

# 5) Model Training

In [8]:
# Splitting the dataset
train_copy = train_df.copy()
test_copy=test_df.copy()

In [9]:
x_train=train_copy.drop(columns=["es_boxcox"])
y_train=train_copy["es_boxcox"]
x_test=test_copy

# A) Linear reggression

In [10]:
# creating and fitting a regression model
lin_reg=LinearRegression()
lin_reg.fit(x_train,y_train)

In [11]:
y_pred=lin_reg.predict(x_train)
y_pred

array([18.40536635, 14.33472781, 20.70930704, ..., 20.57092809,
       14.69497797, 20.2547378 ])

In [12]:
print("Mean absolute error is ", mean_absolute_error(y_train,y_pred))
print("Mean squared  error is " , mean_squared_error(y_train,y_pred))
print("Sum of squared error is " ,mean_squared_error(y_train, y_pred) * len(y_train))

print(" R2 Score Regression " ,r2_score(y_train,y_pred))

Mean absolute error is  4.480029766727431
Mean squared  error is  30.8330654260573
Sum of squared error is  2750216.936808033
 R2 Score Regression  0.28012074029348133


# A1) Ridge Regression (L2)

In [13]:
ridge=Ridge()
ridge.fit(x_train,y_train)

In [14]:
# Define the hyperparameter grid to search
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

In [15]:
# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(ridge, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(x_train, y_train)

In [16]:
best_alpha = grid_search.best_params_['alpha']
best_alpha

1

In [17]:
y_pred1=ridge.predict(x_train)
y_pred1

array([18.40499909, 14.33490857, 20.70908585, ..., 20.57055535,
       14.69490972, 20.25470784])

In [18]:
print("Mean absolute error is ", mean_absolute_error(y_train,y_pred1))
print("Mean squared  error is " , mean_squared_error(y_train,y_pred1))
print("Sum of squared error is " ,mean_squared_error(y_train, y_pred1) * len(y_train))

print(" R2 Score Regression " ,r2_score(y_train,y_pred1))

Mean absolute error is  4.480041215215037
Mean squared  error is  30.833065460390312
Sum of squared error is  2750216.939870435
 R2 Score Regression  0.28012073949188654


# A2) Lasso Regression (L1)¶

In [19]:
lasso=Lasso()
lasso.fit(x_train,y_train)

In [20]:
# Define the hyperparameter grid to search
param_grid = {'alpha': [1e-15,1e-10,1e-08,0.00001,0.0001,0.001, 0.01, 0.1, 1]}

In [21]:
# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(lasso, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(x_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [22]:
best_alpha = grid_search.best_params_['alpha']
best_alpha

1e-15

In [23]:
new_lasso=Lasso(alpha=1e-08)
new_lasso.fit(x_train,y_train)

In [24]:
y_pred2=new_lasso.predict(x_train)
y_pred2

array([18.40536621, 14.33472788, 20.70930698, ..., 20.57092795,
       14.69497796, 20.25473784])

In [25]:
print("Mean absolute error is ", mean_absolute_error(y_train,y_pred2))
print("Mean squared  error is " , mean_squared_error(y_train,y_pred2))
print("Sum of squared error is " ,mean_squared_error(y_train, y_pred2) * len(y_train))

print(" R2 Score Regression " ,r2_score(y_train,y_pred2))

Mean absolute error is  4.480029769015892
Mean squared  error is  30.833065426057303
Sum of squared error is  2750216.9368080334
 R2 Score Regression  0.2801207402934812


In [26]:
print(" R2 Score linear Regression " ,r2_score(y_train,y_pred))
print(" R2 Score ridge Regression " ,r2_score(y_train,y_pred1))
print(" R2 Score lasso Regression " ,r2_score(y_train,y_pred2))

 R2 Score linear Regression  0.28012074029348133
 R2 Score ridge Regression  0.28012073949188654
 R2 Score lasso Regression  0.2801207402934812


# B) Decision tree regression

In [27]:
dt_reg = DecisionTreeRegressor()
dt_reg.fit(x_train,y_train)

In [28]:
# Define the hyperparameter grid to search
param_grid = {
#     'max_depth': [10,5,1],
#     'criterion':['absolute_error', 'friedman_mse', 'poisson', 'squared_error'],
#     'splitter':["best", "random"],
    'max_features':[1,2,3,4,5,6,7,8,9]
    
}

In [29]:
grid_search = GridSearchCV(dt_reg, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(x_train, y_train)

In [30]:
best_params = grid_search.best_params_
best_params

{'max_features': 9}

In [31]:
new_dt_reg=DecisionTreeRegressor(criterion='friedman_mse',splitter='best',max_depth=None,min_samples_split=4,min_samples_leaf=15,
    min_weight_fraction_leaf=0.0,
    max_features=3,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    ccp_alpha=0.0)
new_dt_reg.fit(x_train,y_train)

In [32]:
y_pred3=dt_reg.predict(x_train)
y_pred3

array([22.94405542,  4.49876141, 23.15570492, ..., 21.90250596,
       17.46441224, 22.73352141])

In [33]:
print("Mean absolute error is ", mean_absolute_error(y_train,y_pred3))
print("Mean squared  error is " , mean_squared_error(y_train,y_pred3))
print("Sum of squared error is " ,mean_squared_error(y_train, y_pred3) * len(y_train))

print(" R2 Score Regression " ,r2_score(y_train,y_pred3))

Mean absolute error is  1.3885369819326133e-05
Mean squared  error is  4.299373336312076e-06
Sum of squared error is  0.38349120347902826
 R2 Score Regression  0.9999998996197864


# C) Random forest regressor

In [34]:
rf_reg = RandomForestRegressor(random_state=42,)
rf_reg.fit(x_train,y_train)

In [35]:
# Define the hyperparameter grid to search
param_grid = {
#     'n_estimators': [145,140,150],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [36]:
# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(rf_reg, param_grid, scoring='neg_mean_squared_error', cv=5,n_jobs=3)
grid_search.fit(x_train, y_train)

5 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Balasubramanian\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Balasubramanian\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\Balasubramanian\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Balasubramanian\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_

In [37]:
best_params = grid_search.best_params_
best_params

{'max_features': 'sqrt'}

In [38]:
new_rf_reg=RandomForestRegressor(random_state=42,n_estimators=145,max_depth=10,min_samples_split=10,min_samples_leaf=4,max_features='sqrt')
new_rf_reg.fit(x_train,y_train)

In [39]:
y_pred4=rf_reg.predict(x_train)
y_pred4

array([22.62999815,  6.5262007 , 21.70064996, ..., 21.55675158,
       17.47847385, 21.82924777])

In [40]:
print("Mean absolute error is ", mean_absolute_error(y_train,y_pred4))
print("Mean squared  error is " , mean_squared_error(y_train,y_pred4))
print("Sum of squared error is " ,mean_squared_error(y_train, y_pred4) * len(y_train))

print(" R2 Score Regression " ,r2_score(y_train,y_pred4))

Mean absolute error is  1.4410423568424704
Mean squared  error is  3.5426803414797727
Sum of squared error is  315996.4584189713
 R2 Score Regression  0.9172867807219061
