In [1]:
# Importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')


In [16]:
# Sklearn libraries

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression


In [2]:
# import dataset
df = pd.read_csv('train_0OECtn8.csv')

In [3]:
#lets show the head of the dataset
df.head()

Unnamed: 0,row_id,user_id,category_id,video_id,age,gender,profession,followers,views,engagement_score
0,1,19990,37,128,24,Male,Student,180,1000,4.33
1,2,5304,32,132,14,Female,Student,330,714,1.79
2,3,1840,12,24,19,Male,Student,180,138,4.35
3,4,12597,23,112,19,Male,Student,220,613,3.77
4,5,13626,23,112,27,Male,Working Professional,220,613,3.13


In [4]:
# find the shape of the dataset
df.shape

(89197, 10)

In [5]:
# lets check of the dataset has null values
df.isnull().sum()

row_id              0
user_id             0
category_id         0
video_id            0
age                 0
gender              0
profession          0
followers           0
views               0
engagement_score    0
dtype: int64

In [6]:
#Show unique values in the collumn
df.nunique()

row_id              89197
user_id             27734
category_id            47
video_id              175
age                    58
gender                  2
profession              3
followers              17
views                  43
engagement_score      229
dtype: int64

# Data preparation


In [8]:
#print the column names
df.columns

Index(['row_id', 'user_id', 'category_id', 'video_id', 'age', 'gender',
       'profession', 'followers', 'views', 'engagement_score'],
      dtype='object')

In [11]:
# lets drop the unwanted columns And assign the features to  X, target to y
X= df.drop(['row_id', 'user_id', 'category_id', 'video_id','engagement_score'], axis=1)
y = df['engagement_score']


In [17]:
# encoding catagorical feature
encoder = LabelEncoder()

cat_col=['gender', 'profession']

for col in cat_col:
    X[col]=encoder.fit_transform(X[col])
    
X.head()

Unnamed: 0,age,gender,profession,followers,views
0,24,1,1,180,1000
1,14,0,1,330,714
2,19,1,1,180,138
3,19,1,1,220,613
4,27,1,2,220,613


In [18]:
X.nunique()

age           58
gender         2
profession     3
followers     17
views         43
dtype: int64

# Train test spilt

In [19]:
#spliting the dataset into train test
X_train, X_test, y_train, y_test =train_test_split(X,y, test_size=0.33, random_state=42)

# Data Standardization

In [27]:
#lets standardize the dataset

scaler = StandardScaler()


X_train= scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [28]:
X_train


array([[-0.87918513,  0.83962656,  0.14795203, -0.05634614, -1.20491027],
       [-0.87918513, -1.19100568,  0.14795203,  0.8108446 ,  0.69046214],
       [ 1.46787633,  0.83962656, -1.27951994, -0.27314382, -1.0224481 ],
       ...,
       [-1.32624446,  0.83962656,  0.14795203, -0.27314382, -1.0224481 ],
       [-0.87918513, -1.19100568,  0.14795203,  0.37724923,  0.43724932],
       [ 1.3561115 , -1.19100568, -1.27951994,  1.89483302,  0.46331535]])

# Using Linear regression

In [29]:
le = LinearRegression()

le.fit(X_train, y_train)

LinearRegression()

In [30]:
y_train_pred=le.predict(X_train)
y_test_pred=le.predict(X_test)

In [32]:
print("train_score:",r2_score(y_train, y_train_pred))
print("test_score:", r2_score(y_test, y_test_pred))

train_score: 0.22830132666379555
test_score: 0.23145825169563994


# Using XGBoost Regressor

In [35]:
#import xgboost
from xgboost import XGBRegressor

In [36]:
xgb = XGBRegressor(n_estimators= 800, max_depth = 8, learning_rate=0.01)

xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=8, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=800, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [37]:
y_train_pred=xgb.predict(X_train)
y_test_pred=xgb.predict(X_test)

In [38]:
print("train_score:", r2_score(y_train,y_train_pred))
print("test_score:", r2_score(y_test,y_test_pred))

train_score: 0.38857758606944026
test_score: 0.3516978488168968


#  Using RandomForest Regressor

In [40]:
#import randomforest
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [41]:
rf = RandomForestRegressor(random_state=42)

param= {
       'max_depth': [4, 5, 8, 10, 12],
       'min_samples_leaf':[50, 100, 150, 200],
       'n_estimators': [300, 500,600,800, 900]
}

random_model_rf = RandomizedSearchCV(estimator = rf, param_distributions = param, cv = 5, verbose=2, random_state=42, n_jobs = -1, return_train_score=True)

In [42]:
random_model_rf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
                   n_jobs=-1,
                   param_distributions={'max_depth': [4, 5, 8, 10, 12],
                                        'min_samples_leaf': [50, 100, 150, 200],
                                        'n_estimators': [300, 500, 600, 800,
                                                         900]},
                   random_state=42, return_train_score=True, verbose=2)

In [44]:
best_model= random_model_rf.best_estimator_
best_model

RandomForestRegressor(max_depth=12, min_samples_leaf=50, n_estimators=800,
                      random_state=42)

In [46]:
#prediction
y_train_pred= best_model.predict(X_train)
y_test_pred= best_model.predict(X_test)

In [47]:
#evaluation
print('train_score:', r2_score(y_train, y_train_pred))
print('test_score:', r2_score(y_test, y_test_pred))

train_score: 0.3553492834900065
test_score: 0.34315465534820855


Till now Xgb has the best r2_score .
so now lets predict the score for the test dataset

# Test dataset 
import the test dataset and do the data preprocessing as we done for train dataset

In [49]:
#import test dataset
t_df = pd.read_csv('test_1zqHu22.csv')

#info of  test dataset
print('Shape of the testdataset:', t_df.shape)
print('Columns in testdataset:', t_df.columns)

#data preprocessing
new_t_df= t_df.drop(['row_id', 'user_id', 'category_id', 'video_id'], axis=1)

# encoding the dataset
cat_col=['gender', 'profession']
for col in cat_col:
    new_t_df[col]=encoder.fit_transform(new_t_df[col])
    
# Standardize the data
all_test_col = new_t_df.columns
new_t_df[all_test_col]= scaler.transform(new_t_df[all_test_col])

# predict the engaement score from the xgb model

target = xgb.predict(new_t_df)
print(target[:10])

Shape of the testdataset: (11121, 9)
Columns in testdataset: Index(['row_id', 'user_id', 'category_id', 'video_id', 'age', 'gender',
       'profession', 'followers', 'views'],
      dtype='object')
[4.093889  3.7791603 2.8066874 3.9227989 2.7812016 3.9506576 3.7628167
 3.9249692 2.6174762 4.0590878]


In [50]:
# submission of dataframe
submission_df = pd.DataFrame({'row_id': t_df.row_id.values, 'engagement_score': target})

submission_df

<bound method NDFrame.head of        row_id  engagement_score
0       89198          4.093889
1       89199          3.779160
2       89200          2.806687
3       89201          3.922799
4       89202          2.781202
...       ...               ...
11116  100314          3.918144
11117  100315          3.512344
11118  100316          3.887473
11119  100317          3.726938
11120  100318          3.486180

[11121 rows x 2 columns]>

In [51]:
# save to submission.csv

submission_df.to_csv('submission.csv', index=False)
print("file save succesfully")

file save succesfully
