In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import seaborn as sns
sns.set(style='whitegrid')
import warnings 
warnings.filterwarnings('ignore')
from scipy.stats import zscore
from sklearn.linear_model import LogisticRegression,LinearRegression,Lasso,Ridge
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
import xgboost
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler,PowerTransformer,MinMaxScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error,accuracy_score
#Module related to VIF 
from statsmodels.stats.outliers_influence import variance_inflation_factor
#Moudles related to feature selection
from sklearn.feature_selection import RFE,SelectKBest,f_classif

In [2]:
# Create Data audit Report for continuous variables
def continuous_var_summary(x):
    """Build a data-audit summary for one continuous variable.

    Parameters
    ----------
    x : pandas.Series
        Column to summarise (the notebook applies this column-wise to the
        numeric part of the dataset).

    Returns
    -------
    pandas.Series
        Non-null count, null count, sum, mean, median, standard deviation,
        variance, minimum, nine percentiles (1st to 99th) and maximum,
        indexed by the labels 'N' through 'MAX'.
    """
    # Percentile label -> quantile level, in report order.
    percentiles = [('P1', 0.01), ('P5', 0.05), ('P10', 0.10), ('P25', 0.25), ('P50', 0.50),
                   ('P75', 0.75), ('P90', 0.90), ('P95', 0.95), ('P99', 0.99)]

    labels = ['N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN']
    values = [x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(),
              x.std(), x.var(), x.min()]

    for label, level in percentiles:
        labels.append(label)
        values.append(x.quantile(level))

    labels.append('MAX')
    values.append(x.max())
    return pd.Series(values, index=labels)

In [None]:
# Load the three source tables, each bound to its own name so that no earlier
# load is silently discarded.
df1 = pd.read_csv('E:/DataTrained/Capstone_Project/table1.csv')
print(df1.head())

# load the second csv file
df2=pd.read_csv('E:/DataTrained/Capstone_Project/table_2.csv')
print(df2.head())

# load the third csv file. The original rebound `df1` here, which threw away
# table1.csv before the merge ever saw it.
# NOTE(review): the merge cell joins only df1 and df2 — confirm whether df3
# should take part in it as well.
df3 = pd.read_csv('E:/DataTrained/Capstone_Project/table_3.csv')
print(df3.head())



In [None]:
# Inner-join the two tables on the shared key column, then discard that column.
df = df1.merge(df2, how='inner', on='shoe_Name').drop(columns='shoe_Name')

# Let the notebook render up to 700 columns at a display width of 100.
pd.set_option('display.max_columns', 700)
pd.set_option('display.width', 100)
df

In [None]:
# Shape of the dataset
df.shape

In [None]:
# Count duplicated records (rows identical to an earlier row).
df.duplicated().sum()

In [None]:
# Check whether any cell holds a placeholder for a missing value:
# a single space, 'NA' or '-'.
df.isin([' ', 'NA', '-']).any().any()

In [None]:
# Dataset information
df.info()

In [None]:
df.price.max()

In [None]:
# Show the columns
df.columns

In [None]:
# Check null values in  data
df.isnull().sum()

In [None]:
# Print the value counts of every column to spot unexpected or placeholder values.
for column_name in df.columns:
    print(df[column_name].value_counts())
    print('**********************************')

In [None]:
# Handling missing values in the dataset.
# Each filled column is assigned back to `df`. Calling fillna(inplace=True) on an
# attribute-accessed column acts on an intermediate Series and is not guaranteed
# to update the frame.
df['Certification'] = df['Certification'].fillna('Not Rated')
df['shoe_name'] = df['shoe_name'].fillna(0.0)
for text_col in ['color_1', 'color_2', 'color_3', 'color_4', 'reviews']:
    df[text_col] = df[text_col].fillna('NA')
# `df.size` is the DataFrame's element count (an int), not the "size" column,
# so this column has to be reached with bracket indexing.
df['size'] = df['size'].fillna('NA')
df['style_code'] = df['style_code'].fillna('NA')

# KNN imputation (5 neighbours) over three columns, after min-max scaling.
# NOTE(review): the comment this cell was copied with named "metascore and gross",
# which are not the columns used here.
# NOTE(review): these three columns are already filled just above (0.0 / 'NA'),
# so there is presumably nothing left to impute, and any 'NA' strings would make
# the scaler fail — confirm which columns this step was meant to cover.
from sklearn.impute  import KNNImputer
imputer=KNNImputer(n_neighbors=5)
# Initialize MinMaxScaler
scaler = MinMaxScaler()
# Use some features of the data
data=df[['shoe_name','size','style_code']]
# Perform Min-Max Scaling
scaled_data = scaler.fit_transform(data)

# Fit the imputer on the scaled data
imputer.fit(scaled_data)
# Replace the missing values; `data` now holds the imputed, still-scaled array
data=imputer.transform(scaled_data)

In [None]:
# Reverse the min-max scaling of the imputed array.
# NOTE(review): the result is only stored in `unscaled_data`; nothing in this
# cell writes it back into `df`.
unscaled_data = scaler.inverse_transform(data)

In [None]:
# Check the null values again
df.isnull().sum()

In [None]:
# KNN imputation (5 neighbours) of missing values in three columns.
# NOTE(review): the comment this cell was copied with named "metascore and gross",
# which are not the columns used here.
from sklearn.impute  import KNNImputer

# Min-max scale the selected columns, fit the imputer on the scaled values and
# fill the gaps. `data` ends up holding the imputed, still-scaled array.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df[['Reviews', 'category', 'comfort']])
imputer = KNNImputer(n_neighbors=5)
data = imputer.fit(scaled_data).transform(scaled_data)

In [None]:
# Undo the min-max scaling and store the imputed values back in `df`.
# Without the write-back the imputation result was computed and then dropped,
# leaving the original missing values in place for every later cell.
unscaled_data = scaler.inverse_transform(data)
df[['Reviews', 'category', 'comfort']] = unscaled_data

In [None]:
# Split the dataset into continuous (numeric) and categorical (object) variables.
num = df.select_dtypes(include=['float64', 'int64'])
cat = df.select_dtypes(include=['object'])

# NOTE(review): a second split that read from `df_reviews` / `df_category` was
# removed here — no earlier cell defines either frame, so the cell raised
# NameError on a fresh kernel. Re-add it once those frames exist.

In [None]:
# Descriptive analysis for continous data
num.apply(continuous_var_summary)

In [None]:
# Descriptive analysis for categorical data
cat.describe(include='O')

In [None]:
# Check normality of the continuous variables: density histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
for i in num.columns:
    sns.histplot(num[i], kde=True, stat='density')
    plt.show()

In [None]:
# Bar chart of the 10 most frequent values of each categorical column
# (color_2 and color_3 are left out).
for i in cat.columns.difference(['color_2', 'color_3']):
    top_counts = cat[i].value_counts().sort_values(ascending=False).head(10)
    ax = top_counts.plot(kind='bar', color='r')
    ax.set_xlabel(i)
    ax.set_ylabel('Count')
    ax.set_title(f'Top 10 Categories for {i}')
    plt.show()

In [None]:

# For each listed column, plot the 20 rows with the highest value as an
# interactive bar chart labelled by shoe_name.
for col in ['star_rating', 'Reviews', 'category', 'size', 'comfort', 'no_of_colors']:
    top_rated = df.sort_values(by=col, ascending=False).iloc[:20]
    figure = px.bar(top_rated, x='shoe_name', y=col)
    figure.show()

In [None]:
# Scatter plot of each continuous feature against one fixed y column.
# NOTE(review): the original comment named "Votes" as the target, but the y axis
# is 'shoe_name' — which is also excluded from the x columns — while the
# modelling cells further down use 'price' as the target. Confirm the intended y.
for i in num.columns.difference(['style_code','shoe_name']):
    sns.scatterplot(x=i, y='shoe_name', data=num)
    plt.show()

In [None]:
# Observe the Comparison between variables?

sns.pairplot(df)
plt.show()

In [None]:
# Check outliers with one boxplot per column of interest.
# (The original list had an unterminated string literal and did not parse.)
for col in ['durability', 'star_rating', 'price']:
    # Pass the column as `x=` explicitly instead of relying on positional order.
    sns.boxplot(x=df[col])
    plt.show()

In [None]:
# Remove outliers with the Z-score rule: keep rows whose |z| is below 3 in all
# three columns.
outliers = df[['durability', 'star_rating', 'price']]
z = np.abs(zscore(outliers))
# .copy() makes df_new an independent frame; later cells assign into it and drop
# columns from it, which must not act on a slice of `df`.
df_new = df[(z < 3).all(axis=1)].copy()
df_new

In [None]:
# Percentage of rows removed by the outlier filter.
loss_percentage = ((len(df) - len(df_new)) / len(df)) * 100
loss_percentage

In [None]:
# Check skewness of the numeric columns.
# numeric_only=True is required: df_new still holds text columns, and pandas >= 2.0
# raises on them instead of skipping them silently.
df_new.skew(numeric_only=True)

In [None]:
# Apply a power transform to treat skewness.
# `skew_cols` was never defined in any earlier cell, so it is derived here:
# numeric feature columns whose absolute skewness exceeds the threshold.
# NOTE(review): the 0.5 cut-off and leaving the target ('price') untransformed
# are assumptions — replace them with the intended column list if there was one.
SKEW_THRESHOLD = 0.5
_skewness = df_new.drop(columns=['price']).skew(numeric_only=True)
skew_cols = _skewness[_skewness.abs() > SKEW_THRESHOLD].index.tolist()

scaler = PowerTransformer()
if skew_cols:  # fit_transform cannot run on an empty column selection
    df_new[skew_cols] = scaler.fit_transform(df_new[skew_cols].values)

In [None]:
# Re-check skewness after the power transform.
# numeric_only=True keeps pandas >= 2.0 from raising on the text columns.
df_new.skew(numeric_only=True)

In [None]:
# Drop columns that are not used for modelling.
# (The original line had an unterminated string literal and did not parse.)
df_new = df_new.drop(columns=['shoe_name'])

In [None]:
# One-hot encode the three nominal colour columns (drop_first=True).
# NOTE(review): color_4 and the other text columns are not encoded here — confirm
# they are handled before the numeric-only steps that follow.

df_new=pd.get_dummies(df_new,columns=['color_1','color_2','color_3'],drop_first=True)

In [None]:
# Correlation heatmap of the numeric columns.
# numeric_only=True keeps pandas >= 2.0 from raising on any remaining text column.
plt.figure(figsize = (22,12))
sns.heatmap(df_new.corr(numeric_only=True), annot=True,cmap='summer',fmt='.2f')
plt.show()

In [None]:
# Z-score outlier filter (|z| < 3 in all three columns) applied to `df_gross`.
# NOTE(review): no earlier cell of this notebook defines `df_gross`, so this cell
# raises NameError on a fresh kernel. The column is also spelled 'Durability'
# here but 'durability' in the same filter on `df` — confirm both.
outliers=df_gross[['Durability', 'star_rating','price']]
z=np.abs(zscore(outliers))
dfgross_new=df_gross[(z<3).all(axis=1)]
dfgross_new

In [None]:
# Percentage of df_gross rows removed by the outlier filter.
loss_percentage = ((len(df_gross) - len(dfgross_new)) / len(df_gross)) * 100
loss_percentage

In [None]:
# Check skewness of the numeric columns.
# numeric_only=True keeps pandas >= 2.0 from raising on text columns.
dfgross_new.skew(numeric_only=True)

In [None]:
# Correlation heatmap of the numeric columns.
# numeric_only=True keeps pandas >= 2.0 from raising on text columns.
plt.figure(figsize = (22,12))
sns.heatmap(dfgross_new.corr(numeric_only=True), annot=True,cmap='summer',fmt='.2f')
plt.show()

In [None]:
# price is the target; every other column of df_new is a predictor.
x = df_new.drop('price',axis=1)
y = df_new['price']

In [None]:
# Standardise the predictors with StandardScaler.
sc = StandardScaler()
# Keep x's original row labels: y still carries them, and without `index=` the
# rebuilt frame gets a fresh 0..n-1 index that no longer lines up with y.
x = pd.DataFrame(sc.fit_transform(x), columns=x.columns, index=x.index)
x

In [None]:
from sklearn.feature_selection import f_regression
F_values, p_values  = f_regression(x, y )

In [None]:
# Tabulate the univariate F-test: one row per feature with its F statistic and
# its p-value formatted to three decimals.
import itertools
f_reg_results = pd.DataFrame(
    list(itertools.zip_longest(x.columns, F_values, ['%.3f' % p for p in p_values])),
    columns=['Variable', 'F_Value', 'P_Value'],
)

In [None]:
# Sort the F-test table by P_Value (ascending) and keep the first 25 rows.
f_reg_results = (
    pd.DataFrame(f_reg_results, columns=['Variable', 'F_Value', 'P_Value'])
    .sort_values(by=['P_Value'])
    .head(25)
)

In [None]:
f_reg_results

In [None]:
# Feature list 1: the variables of the F-test table with its last row dropped.
# NOTE(review): the "CI=0.90" tag suggests a p-value cut-off, but the slice is
# positional — confirm it still matches the table for the current data.
l1_fr=f_reg_results['Variable'][:-1].values  # CI=0.90
l1_fr

In [None]:
# Feature list 2: recursive feature elimination with a random forest, keeping 17 features.
# random_state pins the forest so the selected set is the same on every run.
rfe = RFE(RandomForestRegressor(random_state=42), n_features_to_select=17).fit(x,y)

In [None]:
l2_rfe = x.columns[rfe.get_support()]

In [None]:
l2_rfe

In [None]:
# Feature list 3: the 17 best features by univariate F-test.
# f_regression (imported a few cells above) is the scorer for a continuous target;
# f_classif is an ANOVA test that treats every distinct price as a class label.
SKB = SelectKBest(f_regression, k=17).fit(x, y )

In [None]:
l3_skb = x.columns[SKB.get_support()]

In [None]:
l3_skb

In [None]:
# Union of the three feature lists. It is bound to `final_list`, which the next
# cell reads (the original only displayed it), and sorted so the column order
# is reproducible between runs — set iteration order is not.
final_list = sorted(set(l1_fr) | set(l2_rfe) | set(l3_skb))
final_list

In [None]:
# Restrict the predictors to the selected features; the target is left as it is.
x = x.loc[:, final_list]

In [None]:
# Variance inflation factor of every selected feature.
vif = pd.DataFrame({
    "VIF_Factor": [variance_inflation_factor(x.values, col_idx) for col_idx in range(x.shape[1])],
    "features": x.columns,
})

In [None]:
# Rank the features from highest to lowest VIF.
vif = vif.sort_values(by='VIF_Factor', ascending=False)
vif

In [None]:
# Try random_state 1..99 for the train/test split and keep the one on which a
# linear model reaches the highest test R2.
# NOTE(review): picking the split by its test score biases that score upwards.
max_acc = -np.inf  # R2 can be negative, so 0 is not a safe starting value
max_RS = 0
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)

    lr = LinearRegression()
    lr.fit(x_train, y_train)
    pred = lr.predict(x_test)
    acc = r2_score(y_test, pred)
    if acc > max_acc:
        max_acc = acc
        max_RS = i
print("The max accuracy is",max_acc, 'seen for random state:',max_RS)

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=max_RS)

In [None]:
# Fit ordinary least squares on the current split and report train/test metrics.
lr = LinearRegression().fit(x_train, y_train)
pred_train, pred_test = lr.predict(x_train), lr.predict(x_test)
score_train = r2_score(y_train, pred_train)
score_test = r2_score(y_test, pred_test)
mse = mean_squared_error(y_test, pred_test)

for label, value in (('R2_Score_train: ', score_train),
                     ('R2_Score_test: ', score_test),
                     ('RMSE = ', np.sqrt(mse).round(4)),
                     ('The mean absolute error = ', mean_absolute_error(y_test, pred_test))):
    print(label, value)

In [None]:
sns.regplot(x=y_test,y=pred_test)
plt.title("Model-LR")
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
# Cross-validated R2 of the linear model for 2 to 10 folds, printed next to the
# hold-out test score.
for i in range(2, 11):
    cv_score = cross_val_score(lr, x, y, scoring='r2', cv=i)
    cv_mean, cv_std = cv_score.mean(), cv_score.std()
    print(f'At Cross fold {i} the cv score mean is {cv_mean*100} and the cv score std is {cv_std},testing accuracy score= {score_test*100}')
    print('\n')

In [None]:
# Expand the features to polynomial terms up to `degree`.
degree = 2
poly_features = PolynomialFeatures(degree=degree)

# Learn the expansion on the training features only, then reuse it unchanged
# for the test features.
X_train_poly = poly_features.fit(x_train).transform(x_train)
X_test_poly = poly_features.transform(x_test)

In [None]:
# Ridge regression on the polynomial features: fit, predict, report metrics.
ridge_model = Ridge()
ridge_model.fit(X_train_poly, y_train)

pred_train_ridge, pred_test_ridge = (ridge_model.predict(X_train_poly),
                                     ridge_model.predict(X_test_poly))

score_train = r2_score(y_train, pred_train_ridge)
score_test = r2_score(y_test, pred_test_ridge)
mse = mean_squared_error(y_test, pred_test_ridge)

# Train/test R2, RMSE and MAE of the ridge model.
for label, value in (('R2_Score_train: ', score_train),
                     ('R2_Score_test: ', score_test),
                     ('RMSE = ', np.sqrt(mse).round(4)),
                     ('The mean absolute error = ', mean_absolute_error(y_test, pred_test_ridge))):
    print(label, value)

In [None]:
sns.regplot(x=y_test,y=pred_test_ridge)
plt.title("Model-Ridge")
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
# Lasso regression on the polynomial features.
lasso_model = Lasso()
lasso_model.fit(X_train_poly, y_train)

pred_train_lasso = lasso_model.predict(X_train_poly)
pred_test_lasso = lasso_model.predict(X_test_poly)

# Score the Lasso predictions. The original passed the Ridge predictions to
# r2_score here, so the R2 values printed below were Ridge's, not Lasso's.
score_train = r2_score(y_train, pred_train_lasso)
score_test = r2_score(y_test, pred_test_lasso)
mse = mean_squared_error(y_test, pred_test_lasso)

# Train/test R2, RMSE and MAE of the lasso model.
print('R2_Score_train: ',score_train)
print('R2_Score_test: ',score_test)
print('RMSE = ', np.sqrt(mse).round(4))
print('The mean absolute error = ', mean_absolute_error(y_test, pred_test_lasso))

In [None]:
sns.regplot(x=y_test,y=pred_test_lasso)
plt.title("Model-Lasso")
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
# Try random_state 1..99 for the split and keep the one with the highest test R2
# for a default KNN regressor.
# NOTE(review): picking the split by its test score biases that score upwards.
max_acc = -np.inf  # R2 can be negative, so 0 is not a safe starting value
max_RS = 0
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)

    knn = KNeighborsRegressor()
    knn.fit(x_train, y_train)
    pred = knn.predict(x_test)
    acc = r2_score(y_test, pred)
    if acc > max_acc:
        max_acc = acc
        max_RS = i
print("The max accuracy is",max_acc, 'seen for random state:',max_RS)

In [None]:
# Rebuild the split that scored best in the search.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=max_RS)

# Refit on this split. The `knn` left over from the search loop was trained on
# the last split tried (random_state=99), so scoring it here would put rows it
# had trained on into the test set.
knn = KNeighborsRegressor()
knn.fit(x_train, y_train)

pred_train=knn.predict(x_train)
pred_test = knn.predict(x_test)
print('The r2 score_train is:', r2_score(y_train, pred_train))
print('The r2 score_test is:', r2_score(y_test, pred_test))
print('The mean absolute error', mean_absolute_error(y_test, pred_test))
print('The mean squared error', mean_squared_error(y_test, pred_test))
cv = cross_val_score(knn, x,y,cv=10)
print('The cross validation score', cv.mean())

In [None]:
sns.regplot(x=y_test,y=pred_test)
plt.title("Model-KNN")
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
# Try random_state 1..99 for the split and keep the one with the highest test R2
# for a default SVR.
# NOTE(review): picking the split by its test score biases that score upwards.
max_acc = -np.inf  # R2 can be negative, so 0 is not a safe starting value
max_RS = 0
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)

    svr = SVR()
    svr.fit(x_train, y_train)
    pred = svr.predict(x_test)
    acc = r2_score(y_test, pred)
    if acc > max_acc:
        max_acc = acc
        max_RS = i
print("The max accuracy is",max_acc, 'seen for random state:',max_RS)

In [None]:
# Rebuild the split that scored best in the search.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=max_RS)

# Refit on this split; the `svr` left over from the loop was trained on the
# random_state=99 split and had seen part of this test set.
svr = SVR()
svr.fit(x_train, y_train)

pred_train=svr.predict(x_train)
pred_test = svr.predict(x_test)
print('The r2 score_train is:', r2_score(y_train, pred_train))
print('The r2 score_test is:', r2_score(y_test, pred_test))
print('The mean absolute error', mean_absolute_error(y_test, pred_test))
print('The mean squared error', mean_squared_error(y_test, pred_test))
cv = cross_val_score(svr, x,y,cv=10)
print('The cross validation score', cv.mean())

In [None]:
sns.regplot(x=y_test,y=pred_test)
plt.title("Model-SVR")
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
# Try random_state 1..99 for the split and keep the one with the highest test R2
# for a default decision tree.
# NOTE(review): picking the split by its test score biases that score upwards.
max_acc = -np.inf  # R2 can be negative, so 0 is not a safe starting value
max_RS = 0
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)

    dtr = DecisionTreeRegressor()
    dtr.fit(x_train, y_train)
    pred = dtr.predict(x_test)
    acc = r2_score(y_test, pred)
    if acc > max_acc:
        max_acc = acc
        max_RS = i
print("The max accuracy is",max_acc, 'seen for random state:',max_RS)

In [None]:
# Rebuild the split that scored best in the search.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=max_RS)

# Refit on this split; the `dtr` left over from the loop was trained on the
# random_state=99 split and had seen part of this test set.
dtr = DecisionTreeRegressor()
dtr.fit(x_train, y_train)

pred_train=dtr.predict(x_train)
pred_test= dtr.predict(x_test)
print('The r2 score_train is:', r2_score(y_train, pred_train))
print('The r2 score_test is:', r2_score(y_test, pred_test))
print('The mean absolute error', mean_absolute_error(y_test, pred_test))
print('The mean squared error', mean_squared_error(y_test, pred_test))
cv = cross_val_score(dtr, x,y,cv=10)
print('The cross validation score', cv.mean())

In [None]:
# Try random_state 1..99 for the split and keep the one with the highest test R2
# for a default random forest.
# NOTE(review): picking the split by its test score biases that score upwards.
max_acc = -np.inf  # R2 can be negative, so 0 is not a safe starting value
max_RS = 0
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)

    rfr = RandomForestRegressor()
    rfr.fit(x_train, y_train)
    pred = rfr.predict(x_test)
    acc = r2_score(y_test, pred)
    if acc > max_acc:
        max_acc = acc
        max_RS = i
print("The max accuracy is",max_acc, 'seen for random state:',max_RS)

In [None]:
# Rebuild the split that scored best in the search.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=max_RS)

# Refit on this split; the `rfr` left over from the loop was trained on the
# random_state=99 split and had seen part of this test set.
rfr = RandomForestRegressor()
rfr.fit(x_train, y_train)

pred_train=rfr.predict(x_train)
pred_test= rfr.predict(x_test)
print('The r2 score_train is:', r2_score(y_train, pred_train))
print('The r2 score_test is:', r2_score(y_test, pred_test))
print('The mean absolute error', mean_absolute_error(y_test, pred_test))
print('The mean squared error', mean_squared_error(y_test, pred_test))
cv = cross_val_score(rfr, x,y,cv=10)
print('The cross validation score', cv.mean())

In [None]:
sns.regplot(x=y_test,y=pred_test)
plt.title("Model-RFR")
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
# Try random_state 1..99 for the split and keep the one with the highest test R2
# for a default gradient-boosting regressor.
# NOTE(review): picking the split by its test score biases that score upwards.
max_acc = -np.inf  # R2 can be negative, so 0 is not a safe starting value
max_RS = 0
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)

    gbr = GradientBoostingRegressor()
    gbr.fit(x_train, y_train)
    pred = gbr.predict(x_test)
    acc = r2_score(y_test, pred)
    if acc > max_acc:
        max_acc = acc
        max_RS = i
print("The max accuracy is",max_acc, 'seen for random state:',max_RS)

In [None]:
# Rebuild the split that scored best in the search.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=max_RS)

# Refit on this split; the `gbr` left over from the loop was trained on the
# random_state=99 split and had seen part of this test set.
gbr = GradientBoostingRegressor()
gbr.fit(x_train, y_train)

pred_train=gbr.predict(x_train)
pred_test = gbr.predict(x_test)
print('The r2 score_train is:', r2_score(y_train, pred_train))
print('The r2 score_test is:', r2_score(y_test, pred_test))
print('The mean absolute error', mean_absolute_error(y_test, pred_test))
print('The mean squared error', mean_squared_error(y_test, pred_test))
cv = cross_val_score(gbr, x,y,cv=10)
print('The cross validation score', cv.mean())

In [None]:
sns.regplot(x=y_test,y=pred_test)
plt.title("Model-GBR")
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
# Try random_state 1..99 for the split and keep the one with the highest test R2
# for a default AdaBoost regressor.
# NOTE(review): picking the split by its test score biases that score upwards.
max_acc = -np.inf  # R2 can be negative, so 0 is not a safe starting value
max_RS = 0
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)

    adr = AdaBoostRegressor()
    adr.fit(x_train, y_train)
    adr_pred = adr.predict(x_test)
    acc = r2_score(y_test, adr_pred)
    if acc > max_acc:
        max_acc = acc
        max_RS = i
print("The max accuracy is",max_acc, 'seen for random state:',max_RS)

In [None]:
# Rebuild the split that scored best in the search.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=max_RS)

# Refit on this split; the `adr` left over from the loop was trained on the
# random_state=99 split and had seen part of this test set.
adr = AdaBoostRegressor()
adr.fit(x_train, y_train)

pred_train=adr.predict(x_train)
pred_test = adr.predict(x_test)
print('The r2 score_train is:', r2_score(y_train, pred_train))
print('The r2 score_test is:', r2_score(y_test, pred_test))
print('The mean absolute error', mean_absolute_error(y_test, pred_test))
print('The mean squared error', mean_squared_error(y_test, pred_test))
cv = cross_val_score(adr, x,y,cv=10)
print('The cross validation score', cv.mean())

In [None]:
sns.regplot(x=y_test,y=pred_test)
plt.title("Model-ADAB Regressor")
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
# Try random_state 1..99 for the split and keep the one with the highest test R2
# for a default XGBoost regressor.
# NOTE(review): picking the split by its test score biases that score upwards.
max_acc = -np.inf  # R2 can be negative, so 0 is not a safe starting value
max_RS = 0
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)

    xgb = XGBRegressor()
    xgb.fit(x_train, y_train)
    xgb_pred = xgb.predict(x_test)
    acc = r2_score(y_test, xgb_pred)
    if acc > max_acc:
        max_acc = acc
        max_RS = i
print("The max accuracy is",max_acc, 'seen for random state:',max_RS)

In [None]:
# Rebuild the split that scored best in the search.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=max_RS)

# Refit on this split; the `xgb` left over from the loop was trained on the
# random_state=99 split and had seen part of this test set.
xgb = XGBRegressor()
xgb.fit(x_train, y_train)

pred_train=xgb.predict(x_train)
pred_test= xgb.predict(x_test)
print('The r2 score_train is:', r2_score(y_train, pred_train))
print('The r2 score_test is:', r2_score(y_test, pred_test))
print('The mean absolute error', mean_absolute_error(y_test, pred_test))
print('The mean squared error', mean_squared_error(y_test, pred_test))
cv = cross_val_score(xgb, x,y,cv=10)
print('The cross validation score', cv.mean(),cv.std())

In [None]:
sns.regplot(x=y_test,y=pred_test)
plt.title("Model-XGBR")
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
np.random.seed(10)

def rmse_cv(model, x, y):
    """Return the per-fold RMSE of `model` under 10-fold cross-validation.

    The 'neg_mean_squared_error' scorer yields negated MSE values, so each
    fold score is negated and then square-rooted. The original returned the
    MSE itself, despite the function name.

    Parameters
    ----------
    model : estimator passed to cross_val_score
    x : feature matrix
    y : target values

    Returns
    -------
    Array with one RMSE value per fold.
    """
    mse_per_fold = -cross_val_score(model, x, y, scoring='neg_mean_squared_error', cv=10)
    rmse = np.sqrt(mse_per_fold)
    return (rmse)

# Short labels and the matching default-configured regressors, in the same order.
names = ['lr', 'ridge', 'lasso', 'svr', 'knn', 'dtr', 'rfr', 'gbr', 'adr', 'xgb']
models = [
    LinearRegression(), Ridge(), Lasso(), SVR(), KNeighborsRegressor(),
    DecisionTreeRegressor(), RandomForestRegressor(), GradientBoostingRegressor(),
    AdaBoostRegressor(), XGBRegressor(),
]

# Print mean and standard deviation of the cross-validated score for each model.
for model, name in zip(models, names):
    score = rmse_cv(model, x, y)
    print(f"{name}: {score.mean():.6f}, {score.std():.6f}")

In [None]:
# Hyperparameter tuning for the XGB regressor: exhaustive grid search with
# 10-fold cross-validation, scored by R2.
params = dict(
    n_estimators=[100, 200, 500, 700, 1000],
    learning_rate=[0.1, 0.01, 0.001],
    max_depth=[3, 5, 7, 9],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
    min_child_weight=[1, 5, 10],
)

xgb_tuned = GridSearchCV(estimator=XGBRegressor(), param_grid=params, scoring='r2',
                         cv=10, n_jobs=-1, verbose=True)
xgb_tuned.fit(x_train, y_train)

# Hold-out score of the refitted best estimator, then the search summary.
y_pred = xgb_tuned.predict(x_test)
print('The r2 score is:', r2_score(y_test, y_pred))
print("\n The best estimator across ALL searched params:\n", xgb_tuned.best_estimator_)
print("\n The best score across ALL searched params:\n", xgb_tuned.best_score_)
print("\n The best parameters across ALL searched params:\n",xgb_tuned.best_params_)

In [None]:
# Refit XGBoost with fixed, tuned hyperparameters on a fixed split (random_state=53)
# and report train/test metrics.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=53)

xgb_model = XGBRegressor(
    n_estimators=1000,
    max_depth=5,
    colsample_bytree=0.8,
    learning_rate=0.01,
    min_child_weight=5,
    subsample=0.8,
)
xgb_model.fit(x_train, y_train)

pred_train, pred_test = xgb_model.predict(x_train), xgb_model.predict(x_test)
print('The r2 score_train is: ', r2_score(y_train, pred_train))
print('The r2 score_test is: ', r2_score(y_test, pred_test))
print('The mean absolute error ', mean_absolute_error(y_test, pred_test))
print('root_mean_squared_error: ',np.sqrt(mean_squared_error(y_test,pred_test)))

In [None]:
cv = cross_val_score(xgb_model, x,y,cv=10)
print('The cross validation score', cv.mean())

In [None]:
sns.regplot(x=y_test,y=pred_test)
plt.title("Tunned-XGBR")
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
import optuna

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=53)

# Define the objective function to optimize using Optuna
def objective(trial):
    """Optuna objective: validation MSE of an XGBRegressor built from sampled hyperparameters.

    Reads X_train, y_train, X_valid and y_valid from the notebook's global scope.

    Parameters
    ----------
    trial : Optuna trial object that supplies the hyperparameter suggestions.

    Returns
    -------
    Mean squared error of the fitted model on (X_valid, y_valid).
    """
    # Define the hyperparameters to tune
    params = {
        # NOTE(review): lower and upper bound are both 100, so n_estimators is
        # effectively a constant — confirm whether a wider range was intended.
        'n_estimators': trial.suggest_int('n_estimators', 100, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'lambda': trial.suggest_float('lambda', 0.0, 1.0),
    }

    # Create the XGB regressor with the current hyperparameters
    model = XGBRegressor(**params)

    # Train the model; the validation set doubles as the early-stopping monitor.
    # NOTE(review): newer xgboost releases expect early_stopping_rounds on the
    # constructor rather than on fit() — verify against the installed version.
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=10, verbose=False)

    # Predict on the validation set
    y_pred = model.predict(X_valid)

    # Calculate the mean squared error as the objective to minimize
    mse = mean_squared_error(y_valid, y_pred)

    return mse

In [None]:
# Create an Optuna study object
study = optuna.create_study(direction='minimize')

# Optimize the objective function using Optuna
study.optimize(objective, n_trials=100)

# Get the best hyperparameters found by Optuna
best_params = study.best_params

In [None]:
import optuna.visualization as vis
# Visualize the optimization history
vis.plot_optimization_history(study).show()

# Visualize the importance of hyperparameters
vis.plot_param_importances(study).show()

# Visualize the slice of hyperparameters
vis.plot_slice(study).show()

In [None]:
# Train the final model with the best hyperparameters found by Optuna.
# It is fitted on the training part only: the next cell scores it on X_valid,
# and fitting on all of x/y (as the original did) would make that score in-sample.
final_model = XGBRegressor(**best_params)
final_model.fit(X_train, y_train)

In [None]:
# Score the tuned model on the training part and on the validation part.
# X_train / X_valid are the pair produced by the Optuna split; the original mixed
# in the lowercase `x_train` that belongs to an earlier cell's split.
pred_train = final_model.predict(X_train)
pred_test = final_model.predict(X_valid)
valid_mse = mean_squared_error(y_valid, pred_test)
print('The r2 score_train is: ', r2_score(y_train, pred_train))
print('The r2 score_test is: ', r2_score(y_valid, pred_test))
print('The mean absolute error ', mean_absolute_error(y_valid, pred_test))
print('mean_squared_error: ',valid_mse)
print('root_mean_squared_error: ',np.sqrt(valid_mse))

In [None]:
# Calculate the 10-fold cross-validation score of the final model
cv_scores = cross_val_score(final_model, x, y, cv=10)

# No `scoring` argument is passed, so these are the estimator's default scores
# (presumably R2 for a regressor — confirm), not MSE, and nothing is negated.
# The variable name is misleading but kept unchanged.
avg_mse = cv_scores.mean()

print("Cross-validation score :", avg_mse)

In [None]:
sns.regplot(x=y_valid,y=pred_test)
plt.title("Tunned-XGBR")
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
#Saving the model using .pkl
import joblib
# Persist the Optuna-tuned model. The original dumped `xgb`, the
# default-parameter regressor left over from the random-state search loop.
joblib.dump(final_model,'E:/DataTrained/Capstone_Project/IMDB_XGB.pkl')

In [None]:
# Load the saved model and get predictions.
model_1=joblib.load('E:/DataTrained/Capstone_Project/IMDB_XGB.pkl')

# Predict with the loaded model. The original called `model.predict`, where
# `model` is the leftover loop variable of the model-comparison cell, not the
# model that was just loaded.
prediction_1 = model_1.predict(x_test)

In [None]:
pd.DataFrame([model_1.predict(x_test)[:], y_test[:]], index = ['Predicted', 'Original'])

In [None]:
# Second modelling pass: split predictors and target from dfgross_new.
# NOTE(review): the original comment named "Votes", but the target selected here
# is 'Gross_Collection_$M'; `dfgross_new` is built from `df_gross`, which no
# earlier cell defines — confirm this section belongs to this notebook.
x = dfgross_new.drop('Gross_Collection_$M',axis=1)
y = dfgross_new['Gross_Collection_$M']

In [None]:
# Standardise the predictors with StandardScaler.
sc = StandardScaler()
# Keep x's original row labels so the frame stays aligned with y, which still
# carries them; without `index=` the rebuilt frame gets a fresh 0..n-1 index.
x = pd.DataFrame(sc.fit_transform(x), columns=x.columns, index=x.index)
x

In [None]:
# Univariate F-test of every feature against the target, tabulated with the
# p-value formatted to three decimals, sorted by P_Value and cut to 25 rows.
F_values, p_values = f_regression(x, y)

import itertools
f_reg_results = (
    pd.DataFrame(
        list(itertools.zip_longest(x.columns, F_values, ['%.3f' % p for p in p_values])),
        columns=['Variable', 'F_Value', 'P_Value'],
    )
    .sort_values(by=['P_Value'])
    .head(25)
)

In [None]:
f_reg_results

In [None]:
# Feature list 1: the variables of the F-test table with its last six rows dropped.
# NOTE(review): the "CI=0.90" tag suggests a p-value cut-off, but the slice is
# positional — confirm it still matches the table for the current data.
l1_fr=f_reg_results['Variable'][:-6].values  # CI=0.90
l1_fr

In [None]:
# Feature list 2: recursive feature elimination with a random forest, keeping 7 features.
# random_state pins the forest so the selected set is the same on every run.
rfe = RFE(RandomForestRegressor(random_state=42), n_features_to_select=7).fit(x,y)

l2_rfe = x.columns[rfe.get_support()]

In [None]:
l2_rfe

In [None]:
# Feature list 3: the 7 best features by univariate F-test.
# f_regression is the scorer for a continuous target; f_classif is an ANOVA
# test that treats every distinct target value as a class label.
SKB = SelectKBest(f_regression, k=7).fit(x, y )

l3_skb = x.columns[SKB.get_support()]

In [None]:
l3_skb

In [None]:

# Union of the three feature lists. It is bound to `final_list`, which the next
# cell reads (the original only displayed it), and sorted so the column order
# is reproducible between runs — set iteration order is not.
final_list = sorted(set(l1_fr) | set(l2_rfe) | set(l3_skb))
final_list

In [None]:
# Restrict the predictors to the selected features; the target is left as it is.
x = x.loc[:, final_list]


In [None]:
# Variance inflation factor of every selected feature.
vif = pd.DataFrame({
    "VIF_Factor": [variance_inflation_factor(x.values, col_idx) for col_idx in range(x.shape[1])],
    "features": x.columns,
})

In [None]:
# Rank the features from highest to lowest VIF.
vif = vif.sort_values(by='VIF_Factor', ascending=False)
vif

In [None]:
# Try random_state 1..99 for the train/test split and keep the one on which a
# linear model reaches the highest test R2.
# NOTE(review): picking the split by its test score biases that score upwards.
max_acc = -np.inf  # R2 can be negative, so 0 is not a safe starting value
max_RS = 0
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)

    lr = LinearRegression()
    lr.fit(x_train, y_train)
    pred = lr.predict(x_test)
    acc = r2_score(y_test, pred)
    if acc > max_acc:
        max_acc = acc
        max_RS = i
print("The max accuracy is",max_acc, 'seen for random state:',max_RS)

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=max_RS)

In [None]:
# Fit ordinary least squares on the current split and compute train/test R2
# and the test MSE (printed in the next cell).
lr = LinearRegression().fit(x_train, y_train)
pred_train, pred_test = lr.predict(x_train), lr.predict(x_test)
score_train, score_test = r2_score(y_train, pred_train), r2_score(y_test, pred_test)
mse = mean_squared_error(y_test, pred_test)

In [None]:
print('R2_Score_train: ',score_train)
print('R2_Score_test: ',score_test)
print('RMSE = ', np.sqrt(mse).round(4))
print('The mean absolute error = ', mean_absolute_error(y_test, pred_test))

In [None]:
sns.regplot(x=y_test,y=pred_test)
plt.title("Model-LR")
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
# Cross-validated R2 of the linear model for 2 to 10 folds, printed next to the
# hold-out test score.
for i in range(2, 11):
    cv_score = cross_val_score(lr, x, y, scoring='r2', cv=i)
    cv_mean, cv_std = cv_score.mean(), cv_score.std()
    print(f'At Cross fold {i} the cv score mean is {cv_mean*100} and the cv score std is {cv_std},testing accuracy score= {score_test*100}')
    print('\n')

In [None]:
# Expand the features to polynomial terms up to `degree`.
degree = 2
poly_features = PolynomialFeatures(degree=degree)

# Learn the expansion on the training features only, then reuse it unchanged
# for the test features.
X_train_poly = poly_features.fit(x_train).transform(x_train)
X_test_poly = poly_features.transform(x_test)

In [None]:
# Ridge regression on the polynomial features: fit, predict, report metrics.
ridge_model = Ridge()
ridge_model.fit(X_train_poly, y_train)

pred_train_ridge, pred_test_ridge = (ridge_model.predict(X_train_poly),
                                     ridge_model.predict(X_test_poly))

score_train = r2_score(y_train, pred_train_ridge)
score_test = r2_score(y_test, pred_test_ridge)
mse = mean_squared_error(y_test, pred_test_ridge)

# Train/test R2, RMSE and MAE of the ridge model.
for label, value in (('R2_Score_train: ', score_train),
                     ('R2_Score_test: ', score_test),
                     ('RMSE = ', np.sqrt(mse).round(4)),
                     ('The mean absolute error = ', mean_absolute_error(y_test, pred_test_ridge))):
    print(label, value)

In [None]:
sns.regplot(x=y_test,y=pred_test_ridge)
plt.title("Model-Ridge")
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()


In [None]:
# Lasso regression on the polynomial features.
lasso_model = Lasso()
lasso_model.fit(X_train_poly, y_train)

pred_train_lasso = lasso_model.predict(X_train_poly)
pred_test_lasso = lasso_model.predict(X_test_poly)

# Score the Lasso predictions. The original passed the Ridge predictions to
# r2_score here, so the R2 values printed below were Ridge's, not Lasso's.
score_train = r2_score(y_train, pred_train_lasso)
score_test = r2_score(y_test, pred_test_lasso)
mse = mean_squared_error(y_test, pred_test_lasso)

# Train/test R2, RMSE and MAE of the lasso model.
print('R2_Score_train: ',score_train)
print('R2_Score_test: ',score_test)
print('RMSE = ', np.sqrt(mse).round(4))
print('The mean absolute error = ', mean_absolute_error(y_test, pred_test_lasso))

In [None]:
sns.regplot(x=y_test,y=pred_test_lasso)
plt.title("Model-Lasso")
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
# Try random_state 1..99 for the split and keep the one with the highest test R2
# for a default KNN regressor.
# NOTE(review): picking the split by its test score biases that score upwards.
max_acc = -np.inf  # R2 can be negative, so 0 is not a safe starting value
max_RS = 0
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)

    knn = KNeighborsRegressor()
    knn.fit(x_train, y_train)
    pred = knn.predict(x_test)
    acc = r2_score(y_test, pred)
    if acc > max_acc:
        max_acc = acc
        max_RS = i
print("The max accuracy is",max_acc, 'seen for random state:',max_RS)

In [None]:
# Rebuild the split that scored best in the search.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=max_RS)

# Refit on this split; the `knn` left over from the loop was trained on the
# random_state=99 split and had seen part of this test set.
knn = KNeighborsRegressor()
knn.fit(x_train, y_train)

pred_train=knn.predict(x_train)
pred_test = knn.predict(x_test)
print('The r2 score_train is:', r2_score(y_train, pred_train))
print('The r2 score_test is:', r2_score(y_test, pred_test))
print('The mean absolute error', mean_absolute_error(y_test, pred_test))
print('The mean squared error', mean_squared_error(y_test, pred_test))
cv = cross_val_score(knn, x,y,cv=10)
print('The cross validation score', cv.mean())

In [None]:
sns.regplot(x=y_test,y=pred_test)
plt.title("Model-KNN")
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
# Try random_state 1..99 for the split and keep the one with the highest test R2
# for a default SVR.
# NOTE(review): picking the split by its test score biases that score upwards.
max_acc = -np.inf  # R2 can be negative, so 0 is not a safe starting value
max_RS = 0
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)

    svr = SVR()
    svr.fit(x_train, y_train)
    pred = svr.predict(x_test)
    acc = r2_score(y_test, pred)
    if acc > max_acc:
        max_acc = acc
        max_RS = i
print("The max accuracy is",max_acc, 'seen for random state:',max_RS)

In [None]:
# Rebuild the split that scored best in the search.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=max_RS)

# Refit on this split; the `svr` left over from the loop was trained on the
# random_state=99 split and had seen part of this test set.
svr = SVR()
svr.fit(x_train, y_train)

pred_train=svr.predict(x_train)
pred_test = svr.predict(x_test)
print('The r2 score_train is:', r2_score(y_train, pred_train))
print('The r2 score_test is:', r2_score(y_test, pred_test))
print('The mean absolute error', mean_absolute_error(y_test, pred_test))
print('The mean squared error', mean_squared_error(y_test, pred_test))
cv = cross_val_score(svr, x,y,cv=10)
print('The cross validation score', cv.mean())

In [None]:
sns.regplot(x=y_test,y=pred_test)
plt.title("Model-SVR")
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
plt.show()

In [None]:
# Try train/test split seeds 1..99 and remember the seed whose split gives a
# default DecisionTreeRegressor the highest R² on its test part.
# NOTE(review): choosing the seed by its test score makes that score
# optimistic; treat the later cross-validation figure as the fairer estimate.

max_acc = float('-inf')  # R² can be negative, so 0 is not a safe starting floor
max_RS = 0
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)

    dtr = DecisionTreeRegressor()
    dtr.fit(x_train, y_train)
    pred = dtr.predict(x_test)
    acc = r2_score(y_test, pred)  # the "accuracy" in the message below is this R²
    if acc > max_acc:
        max_acc = acc
        max_RS = i
print("The max accuracy is",max_acc, 'seen for random state:',max_RS)

In [None]:
# Recreate the split selected by the seed search and refit the tree on its
# training part. Without the refit, `dtr` is still the model from the search
# loop's last iteration (seed 99), so the next cell would score it on rows it
# was trained on.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=max_RS)
dtr = DecisionTreeRegressor()
dtr.fit(x_train, y_train)

In [None]:
# Score the decision tree on the current train/test split, then cross-validate it.
pred_train = dtr.predict(x_train)
pred_test = dtr.predict(x_test)

metric_rows = [
    ('The r2 score_train is:', r2_score(y_train, pred_train)),
    ('The r2 score_test is:', r2_score(y_test, pred_test)),
    ('The mean absolute error', mean_absolute_error(y_test, pred_test)),
    ('The mean squared error', mean_squared_error(y_test, pred_test)),
]
for label, value in metric_rows:
    print(label, value)

# 10-fold cross-validation on the full data with the estimator's default scorer.
cv = cross_val_score(dtr, x, y, cv=10)
print('The cross validation score', cv.mean())

In [None]:
# Actual vs. predicted test targets for the decision tree, with seaborn's fitted regression line.
fig, ax = plt.subplots()
sns.regplot(x=y_test, y=pred_test, ax=ax)
ax.set_title("Model-DT")
ax.set_xlabel('Y Test')
ax.set_ylabel('Predicted Y')
plt.show()

In [None]:
# Try train/test split seeds 1..99 and remember the seed whose split gives a
# default RandomForestRegressor the highest R² on its test part.
# NOTE(review): choosing the seed by its test score makes that score
# optimistic; treat the later cross-validation figure as the fairer estimate.
max_acc = float('-inf')  # R² can be negative, so 0 is not a safe starting floor
max_RS = 0
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)

    rfr = RandomForestRegressor()
    rfr.fit(x_train, y_train)
    pred = rfr.predict(x_test)
    acc = r2_score(y_test, pred)  # the "accuracy" in the message below is this R²
    if acc > max_acc:
        max_acc = acc
        max_RS = i
print("The max accuracy is",max_acc, 'seen for random state:',max_RS)

In [None]:
# Recreate the split selected by the seed search and refit the forest on its
# training part. Without the refit, `rfr` is still the model from the search
# loop's last iteration (seed 99), so the next cell would score it on rows it
# was trained on.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=max_RS)
rfr = RandomForestRegressor()
rfr.fit(x_train, y_train)

In [None]:
# Score the random forest on the current train/test split, then cross-validate it.
pred_train = rfr.predict(x_train)
pred_test = rfr.predict(x_test)

metric_rows = [
    ('The r2 score_train is:', r2_score(y_train, pred_train)),
    ('The r2 score_test is:', r2_score(y_test, pred_test)),
    ('The mean absolute error', mean_absolute_error(y_test, pred_test)),
    ('The mean squared error', mean_squared_error(y_test, pred_test)),
]
for label, value in metric_rows:
    print(label, value)

# 10-fold cross-validation on the full data with the estimator's default scorer.
cv = cross_val_score(rfr, x, y, cv=10)
print('The cross validation score', cv.mean())

In [None]:
# Actual vs. predicted test targets for the random forest, with seaborn's fitted regression line.
fig, ax = plt.subplots()
sns.regplot(x=y_test, y=pred_test, ax=ax)
ax.set_title("Model-RFR")
ax.set_xlabel('Y Test')
ax.set_ylabel('Predicted Y')
plt.show()

In [None]:
# Try train/test split seeds 1..99 and remember the seed whose split gives a
# default GradientBoostingRegressor the highest R² on its test part.
# NOTE(review): choosing the seed by its test score makes that score
# optimistic; the cross-validation figure printed in the next cell is the
# fairer estimate.

max_acc = float('-inf')  # R² can be negative, so 0 is not a safe starting floor
max_RS = 0
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)

    gbr = GradientBoostingRegressor()
    gbr.fit(x_train, y_train)
    pred = gbr.predict(x_test)
    acc = r2_score(y_test, pred)  # the "accuracy" in the message below is this R²
    if acc > max_acc:
        max_acc = acc
        max_RS = i
print("The max accuracy is",max_acc, 'seen for random state:',max_RS)

In [None]:
# Recreate the split selected by the seed search and evaluate gradient boosting on it.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=max_RS)

# Refit first: the `gbr` left by the search loop was trained on the last split
# it tried (seed 99), so part of this test set would otherwise consist of rows
# the model was trained on.
gbr = GradientBoostingRegressor()
gbr.fit(x_train, y_train)

pred_train = gbr.predict(x_train)
pred_test = gbr.predict(x_test)
print('The r2 score_train is:', r2_score(y_train, pred_train))
print('The r2 score_test is:', r2_score(y_test, pred_test))
print('The mean absolute error', mean_absolute_error(y_test, pred_test))
print('The mean squared error', mean_squared_error(y_test, pred_test))
# 10-fold cross-validation on the full data; no `scoring` is passed, so the
# estimator's default scorer is used.
cv = cross_val_score(gbr, x, y, cv=10)
print('The cross validation score', cv.mean())

In [None]:
# Actual vs. predicted test targets for gradient boosting, with seaborn's fitted regression line.
fig, ax = plt.subplots()
sns.regplot(x=y_test, y=pred_test, ax=ax)
ax.set_title("Model-GBR")
ax.set_xlabel('Y Test')
ax.set_ylabel('Predicted Y')
plt.show()

In [None]:
# Try train/test split seeds 1..99 and remember the seed whose split gives a
# default AdaBoostRegressor the highest R² on its test part.
# NOTE(review): choosing the seed by its test score makes that score
# optimistic; the cross-validation figure printed in the next cell is the
# fairer estimate.
max_acc = float('-inf')  # R² can be negative, so 0 is not a safe starting floor
max_RS = 0
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)

    adr = AdaBoostRegressor()
    adr.fit(x_train, y_train)
    adr_pred = adr.predict(x_test)
    acc = r2_score(y_test, adr_pred)  # the "accuracy" in the message below is this R²
    if acc > max_acc:
        max_acc = acc
        max_RS = i
print("The max accuracy is",max_acc, 'seen for random state:',max_RS)

In [None]:
# Recreate the split selected by the seed search and evaluate AdaBoost on it.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=max_RS)

# Refit first: the `adr` left by the search loop was trained on the last split
# it tried (seed 99), so part of this test set would otherwise consist of rows
# the model was trained on.
adr = AdaBoostRegressor()
adr.fit(x_train, y_train)

pred_train = adr.predict(x_train)
pred_test = adr.predict(x_test)
print('The r2 score_train is:', r2_score(y_train, pred_train))
print('The r2 score_test is:', r2_score(y_test, pred_test))
print('The mean absolute error', mean_absolute_error(y_test, pred_test))
print('The mean squared error', mean_squared_error(y_test, pred_test))
# 10-fold cross-validation on the full data; no `scoring` is passed, so the
# estimator's default scorer is used.
cv = cross_val_score(adr, x, y, cv=10)
print('The cross validation score', cv.mean())

In [None]:
# Actual vs. predicted test targets for AdaBoost, with seaborn's fitted regression line.
fig, ax = plt.subplots()
sns.regplot(x=y_test, y=pred_test, ax=ax)
ax.set_title("Model-ADAB Regressor")
ax.set_xlabel('Y Test')
ax.set_ylabel('Predicted Y')
plt.show()

In [None]:
# Try train/test split seeds 1..99 and remember the seed whose split gives a
# default XGBRegressor the highest R² on its test part.
# NOTE(review): choosing the seed by its test score makes that score
# optimistic; the cross-validation figure printed in the next cell is the
# fairer estimate.
max_acc = float('-inf')  # R² can be negative, so 0 is not a safe starting floor
max_RS = 0
for i in range(1, 100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=i)

    xgb = XGBRegressor()
    xgb.fit(x_train, y_train)
    xgb_pred = xgb.predict(x_test)
    acc = r2_score(y_test, xgb_pred)  # the "accuracy" in the message below is this R²
    if acc > max_acc:
        max_acc = acc
        max_RS = i
print("The max accuracy is",max_acc, 'seen for random state:',max_RS)

In [None]:
# Recreate the split selected by the seed search and evaluate XGBoost on it.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=max_RS)

# Refit first: the `xgb` left by the search loop was trained on the last split
# it tried (seed 99), so part of this test set would otherwise consist of rows
# the model was trained on.
xgb = XGBRegressor()
xgb.fit(x_train, y_train)

pred_train = xgb.predict(x_train)
pred_test = xgb.predict(x_test)
print('The r2 score_train is:', r2_score(y_train, pred_train))
print('The r2 score_test is:', r2_score(y_test, pred_test))
print('The mean absolute error', mean_absolute_error(y_test, pred_test))
print('The mean squared error', mean_squared_error(y_test, pred_test))
# 10-fold cross-validation on the full data; no `scoring` is passed, so the
# estimator's default scorer is used. Mean and spread across folds are printed.
cv = cross_val_score(xgb, x, y, cv=10)
print('The cross validation score', cv.mean(),cv.std())

In [None]:
# Actual vs. predicted test targets for XGBoost, with seaborn's fitted regression line.
fig, ax = plt.subplots()
sns.regplot(x=y_test, y=pred_test, ax=ax)
ax.set_title("Model-XGBR")
ax.set_xlabel('Y Test')
ax.set_ylabel('Predicted Y')
plt.show()

In [None]:
# Seed NumPy's global random generator before the model comparison below.
np.random.seed(10)

def rmse_cv(model, x, y):
    """Return the per-fold root-mean-squared error of `model` under 10-fold CV.

    Parameters
    ----------
    model : estimator passed straight to `cross_val_score`.
    x, y : features and target passed straight to `cross_val_score`.

    Returns
    -------
    Array with one RMSE value per fold (10 values).
    """
    # The scorer yields negated MSE (higher is better), so flip the sign back
    # and take the square root; without the root this would report MSE.
    mse = -cross_val_score(model, x, y, scoring='neg_mean_squared_error', cv=10)
    return np.sqrt(mse)

# Default-configured regressors to compare, keyed by the short label printed for each.
candidates = {
    'lr': LinearRegression(),
    'ridge': Ridge(),
    'lasso': Lasso(),
    'svr': SVR(),
    'knn': KNeighborsRegressor(),
    'dtr': DecisionTreeRegressor(),
    'rfr': RandomForestRegressor(),
    'gbr': GradientBoostingRegressor(),
    'adr': AdaBoostRegressor(),
    'xgb': XGBRegressor(),
}
models = list(candidates.values())
names = list(candidates)

# For every candidate, print the mean and standard deviation of the per-fold
# scores returned by rmse_cv.
for name, model in candidates.items():
    score = rmse_cv(model, x, y)
    print("{}: {:.6f}, {:.6f}".format(name, score.mean(), score.std()))

In [None]:
# Hold out 20% of the data as the validation set used during tuning.
# NOTE: this rebinds `y_train` but not the lowercase `x_train` from earlier
# cells; from here on pair `X_train` (capital X) with `y_train`.
X_train, X_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=22)

def objective(trial):
    """Optuna objective: validation MSE of an XGBRegressor built from `trial`.

    Parameters
    ----------
    trial : Optuna trial object used to sample the hyperparameters.

    Returns
    -------
    Mean squared error on (X_valid, y_valid); the study should minimise it.

    The same validation set drives early stopping and the returned score, so
    the score is somewhat optimistic.
    """
    params = {
        # NOTE(review): the range 100..100 pins n_estimators at 100 — widen the
        # upper bound if it was meant to be tuned.
        'n_estimators': trial.suggest_int('n_estimators', 100, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'lambda': trial.suggest_float('lambda', 0.0, 1.0),
    }

    # early_stopping_rounds is set on the estimator (XGBoost >= 1.6): passing
    # it to fit() is deprecated and rejected by newer releases.
    model = XGBRegressor(**params, early_stopping_rounds=10)

    # Fit on the training part, monitoring the validation part for early stopping.
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

    y_pred = model.predict(X_valid)
    mse = mean_squared_error(y_valid, y_pred)

    return mse

In [None]:
# NOTE(review): `optuna` is not among the imports in the notebook's first
# cell — confirm it is imported before this cell runs.

# Create a study that minimises the objective (validation MSE). The sampler is
# seeded so that re-running the notebook explores the same trials.
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=10))

# Run 100 trials of the objective.
study.optimize(objective, n_trials=100)

# Hyperparameters of the best trial, as a dict keyed by the suggested names.
best_params = study.best_params

In [None]:
# Train the final model with the best hyperparameters on the training part
# only, so that X_valid stays unseen and the test metrics below are honest.
final_model = XGBRegressor(**best_params)
final_model.fit(X_train, y_train)

# Predict on the split created in the tuning cell: X_train pairs with y_train
# and X_valid with y_valid. The lowercase x_train belongs to a different
# split and must not be scored against this y_train.
pred_train = final_model.predict(X_train)
pred_test = final_model.predict(X_valid)
print('The r2 score_train is: ', r2_score(y_train, pred_train))
print('The r2 score_test is: ', r2_score(y_valid, pred_test))
print('The mean absolute error ', mean_absolute_error(y_valid, pred_test))
print('mean_squared_error: ',mean_squared_error(y_valid,pred_test))
print('root_mean_squared_error: ',np.sqrt(mean_squared_error(y_valid,pred_test)))

In [None]:
# 10-fold cross-validation of the tuned model on the full data set.
# No `scoring` is passed, so each fold is scored with the estimator's own
# default scorer — R² for a regressor (higher is better), not MSE.
cv_scores = cross_val_score(final_model, x, y, cv=10)

avg_r2 = cv_scores.mean()
avg_mse = avg_r2  # legacy name kept in case a later cell reads it; the value is a mean R²

print("Cross-validation score :", avg_r2)

In [None]:
# Actual vs. predicted validation targets for the tuned model, with seaborn's fitted regression line.
fig, ax = plt.subplots()
sns.regplot(x=y_valid, y=pred_test, ax=ax)
ax.set_title("Tunned-XGBR")
ax.set_xlabel('Y Test')
ax.set_ylabel('Predicted Y')
plt.show()

In [None]:
# Grid of XGBRegressor hyperparameters to search exhaustively.
params = {
    'n_estimators': [100, 200,500,700],
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3,5,7,9],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8,1]
}

# Rebuild the split here. At this point `y_train` comes from the tuning cell's
# random_state=22 split while `x_train`, `x_test` and `y_test` still come from
# the earlier XGB seed-search split, so features and targets would otherwise
# be drawn from different splits. random_state=22 matches the refit cell below.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=22)

# 10-fold grid search scored by R², using all available cores.
xgb_tuned = GridSearchCV(XGBRegressor(), params, scoring='r2', cv=10, n_jobs=-1, verbose=True)
xgb_tuned.fit(x_train, y_train)
y_pred = xgb_tuned.predict(x_test)
print('The r2 score is:', r2_score(y_test, y_pred))
print("\n The best estimator across ALL searched params:\n", xgb_tuned.best_estimator_)
print("\n The best score across ALL searched params:\n", xgb_tuned.best_score_)
print("\n The best parameters across ALL searched params:\n",xgb_tuned.best_params_)

In [None]:
# Rebuild the random_state=22 split and fit XGBoost with fixed hyperparameters.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=22)

# NOTE(review): these values are hard-coded — presumably copied from the grid
# search output; confirm against xgb_tuned.best_params_.
tuned_params = dict(
    n_estimators=100,
    max_depth=3,
    colsample_bytree=1,
    learning_rate=0.1,
    subsample=0.8,
)
xgb_model = XGBRegressor(**tuned_params)
xgb_model.fit(x_train, y_train)

# Report fit quality on the training and test parts of the split.
pred_train = xgb_model.predict(x_train)
pred_test = xgb_model.predict(x_test)
print('The r2 score_train is: ', r2_score(y_train, pred_train))
print('The r2 score_test is: ', r2_score(y_test, pred_test))
print('The mean absolute error ', mean_absolute_error(y_test, pred_test))
test_mse = mean_squared_error(y_test, pred_test)
print('root_mean_squared_error: ', np.sqrt(test_mse))

In [None]:
# 10-fold cross-validation of the tuned XGBoost model on the full data,
# using the estimator's default scorer; the mean across folds is printed.
cv = cross_val_score(estimator=xgb_model, X=x, y=y, cv=10)
cv_mean = cv.mean()
print('The cross validation score', cv_mean)

In [None]:
# Actual vs. predicted test targets for the tuned XGBoost model, with seaborn's fitted regression line.
fig, ax = plt.subplots()
sns.regplot(x=y_test, y=pred_test, ax=ax)
ax.set_title("Tunned-XGBR")
ax.set_xlabel('Y Test')
ax.set_ylabel('Predicted Y')
plt.show()