In [1]:
import pandas as pd
df1 = pd.DataFrame(pd.read_csv("./data/pubgdata.csv"))
df1.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,...,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [2]:

rows_to_delete = range(3400000)

# Drop the rows
df1.drop(index=rows_to_delete, inplace=True)

# Reset index if needed
df1.reset_index(drop=True, inplace=True)
df1.to_csv('data/reducedata.csv', index=False)

In [3]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1046966 entries, 0 to 1046965
Data columns (total 29 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   Id               1046966 non-null  object 
 1   groupId          1046966 non-null  object 
 2   matchId          1046966 non-null  object 
 3   assists          1046966 non-null  int64  
 4   boosts           1046966 non-null  int64  
 5   damageDealt      1046966 non-null  float64
 6   DBNOs            1046966 non-null  int64  
 7   headshotKills    1046966 non-null  int64  
 8   heals            1046966 non-null  int64  
 9   killPlace        1046966 non-null  int64  
 10  killPoints       1046966 non-null  int64  
 11  kills            1046966 non-null  int64  
 12  killStreaks      1046966 non-null  int64  
 13  longestKill      1046966 non-null  float64
 14  matchDuration    1046966 non-null  int64  
 15  matchType        1046966 non-null  object 
 16  maxPlace         1

In [4]:
df1.dropna(inplace=True)

In [5]:
# Drop columns by index
columns_to_drop = [0,1,2,21,23,24]  # Indices of columns to drop
df1.drop(df1.columns[columns_to_drop], axis=1, inplace=True)

In [6]:
x=df1.drop(columns=['winPlacePerc'],axis=1)
y=df1['winPlacePerc']
print(x)
print(y)

         assists  boosts  damageDealt  DBNOs  headshotKills  heals  killPlace  \
0              0       0        74.13      0              0      0         39   
1              1       9       476.40      3              0     10          3   
2              0       2       196.00      0              0      2         14   
3              0       0         0.00      0              0      0         94   
4              0       1        68.72      0              0      3         52   
...          ...     ...          ...    ...            ...    ...        ...   
1046961        0       0         0.00      0              0      0         74   
1046962        0       1        44.15      0              0      0         69   
1046963        0       0        59.06      0              0      0         66   
1046964        0       4       180.40      1              1      2         11   
1046965        0       2       268.00      0              0      1         18   

         killPoints  kills 

Define ordinal which columns should be ordinal encoded and which should be scaled

In [7]:
numerical_columns=x.select_dtypes(exclude='object').columns
categorical_columns=x.select_dtypes(include='object').columns

In [8]:
numerical_columns

Index(['assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
       'killPlace', 'killPoints', 'kills', 'killStreaks', 'longestKill',
       'matchDuration', 'maxPlace', 'numGroups', 'rankPoints', 'revives',
       'rideDistance', 'swimDistance', 'walkDistance', 'weaponsAcquired',
       'winPoints'],
      dtype='object')

In [9]:
categorical_columns

Index(['matchType'], dtype='object')

We need simple imputer to fill the missing values and standard scaler to do feature scaling of numerical values

In [10]:
type_categories = ['squad','normal-squad','normal-squad-fpp','squad-fpp','duo','normal-duo','normal-duo-fpp','duo-fpp','flarefpp','flaretpp','crashfpp','crashtpp','solo','normal-solo','normal-solo-fpp','solo-fpp']

In [11]:
from sklearn.impute import SimpleImputer # Missing values
from sklearn.preprocessing import StandardScaler # Feature scaling
from sklearn.preprocessing import OrdinalEncoder #Encoding Categorical Variables
# Pipeline
from sklearn.pipeline import Pipeline #To add everything together 
from sklearn.compose import ColumnTransformer # Combine everything together

In [12]:
num_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",StandardScaler())
    ]
)
cat_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("Ordinalencoder",OrdinalEncoder(categories=[type_categories])),
        ("scaler",StandardScaler())
    ]
)
preprocessor=ColumnTransformer(
    [
        ("numericalpipeline",num_pipeline,numerical_columns),
        ("categoricalpipeline",cat_pipeline,categorical_columns)
    ]
)

Combine both the pipelines

In [13]:
preprocessor = ColumnTransformer([
    ('num_pipe', num_pipeline, numerical_columns), 
    ('cat_pipe', cat_pipeline, categorical_columns)
])

Train Test Split

In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state= 42)

In [15]:
x_train = pd.DataFrame(preprocessor.fit_transform(x_train), columns=preprocessor.get_feature_names_out()) 
x_test = pd.DataFrame(preprocessor.transform(x_test), columns = preprocessor.get_feature_names_out())

In [16]:
preprocessor.get_feature_names_out()

array(['num_pipe__assists', 'num_pipe__boosts', 'num_pipe__damageDealt',
       'num_pipe__DBNOs', 'num_pipe__headshotKills', 'num_pipe__heals',
       'num_pipe__killPlace', 'num_pipe__killPoints', 'num_pipe__kills',
       'num_pipe__killStreaks', 'num_pipe__longestKill',
       'num_pipe__matchDuration', 'num_pipe__maxPlace',
       'num_pipe__numGroups', 'num_pipe__rankPoints', 'num_pipe__revives',
       'num_pipe__rideDistance', 'num_pipe__swimDistance',
       'num_pipe__walkDistance', 'num_pipe__weaponsAcquired',
       'num_pipe__winPoints', 'cat_pipe__matchType'], dtype=object)

In [17]:
x_train.head()

Unnamed: 0,num_pipe__assists,num_pipe__boosts,num_pipe__damageDealt,num_pipe__DBNOs,num_pipe__headshotKills,num_pipe__heals,num_pipe__killPlace,num_pipe__killPoints,num_pipe__kills,num_pipe__killStreaks,...,num_pipe__maxPlace,num_pipe__numGroups,num_pipe__rankPoints,num_pipe__revives,num_pipe__rideDistance,num_pipe__swimDistance,num_pipe__walkDistance,num_pipe__weaponsAcquired,num_pipe__winPoints,cat_pipe__matchType
0,-0.397745,-0.64549,-0.767354,-0.574792,-0.379393,-0.51013,1.287448,0.992998,-0.595026,-0.764859,...,2.120996,2.105307,-1.212754,-0.348766,-0.40359,-0.147673,-0.914354,-1.088875,1.220616,2.146745
1,1.302627,-0.64549,0.329124,-0.574792,-0.379393,-0.51013,0.55948,1.298949,-0.595026,-0.764859,...,-0.734386,-0.68707,-1.211396,-0.348766,-0.40359,-0.147673,-0.11961,-0.268696,1.180056,-0.530202
2,-0.397745,-0.64549,-0.767354,-0.574792,-0.379393,-0.51013,1.760627,1.479014,-0.595026,-0.764859,...,2.246969,2.320105,-1.212754,-0.348766,-0.40359,-0.147673,-0.954182,-1.088875,1.246304,2.146745
3,-0.397745,-0.64549,-0.767354,-0.574792,-0.379393,-0.51013,0.632277,1.649518,-0.595026,-0.764859,...,-0.90235,-0.901868,-1.212754,-0.348766,-0.40359,-0.147673,-0.873723,-0.268696,1.212504,-0.530202
4,1.302627,-0.64549,0.215654,1.173751,-0.379393,-0.137137,-0.496073,1.36747,0.049337,0.642216,...,-0.776377,-0.73003,-1.212754,-0.348766,-0.40359,-0.147673,-0.780706,-0.678786,1.188168,-0.530202


In [18]:
x_test.head()

Unnamed: 0,num_pipe__assists,num_pipe__boosts,num_pipe__damageDealt,num_pipe__DBNOs,num_pipe__headshotKills,num_pipe__heals,num_pipe__killPlace,num_pipe__killPoints,num_pipe__kills,num_pipe__killStreaks,...,num_pipe__maxPlace,num_pipe__numGroups,num_pipe__rankPoints,num_pipe__revives,num_pipe__rideDistance,num_pipe__swimDistance,num_pipe__walkDistance,num_pipe__weaponsAcquired,num_pipe__winPoints,cat_pipe__matchType
0,-0.397745,-0.64549,-0.767354,-0.574792,-0.379393,-0.51013,1.396643,1.010526,-0.595026,-0.764859,...,0.105432,0.129163,-1.212754,-0.348766,-0.40359,-0.147673,-0.820111,-1.088875,1.224672,-0.307123
1,1.302627,1.688142,-0.180077,0.299479,-0.379393,1.354832,-1.551627,-0.804466,1.338064,0.642216,...,-0.776377,-0.772989,0.77112,3.874621,2.140887,-0.147673,1.78385,0.961572,-0.819562,-0.530202
2,-0.397745,-0.64549,-0.767354,-0.574792,-0.379393,-0.51013,0.086301,0.78903,-0.595026,-0.764859,...,0.189414,0.215083,-1.212754,-0.348766,-0.40359,-0.147673,0.348856,0.551482,1.208448,0.362114
3,-0.397745,0.521326,1.411492,0.299479,1.291545,0.235855,-1.369635,0.977063,1.338064,2.049291,...,0.231405,0.215083,-1.212754,-0.348766,-0.40359,-0.147673,-0.34674,0.961572,1.127327,0.362114
4,1.302627,0.521326,1.151041,1.173751,2.962483,-0.137137,-1.369635,-0.804466,1.338064,0.642216,...,-0.650404,-0.64411,0.654342,-0.348766,-0.40359,-0.147673,-0.089169,0.141393,-0.819562,-0.530202


In [19]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error

In [20]:
regression = LinearRegression()
regression.fit(x_train, y_train)

In [21]:
regression.coef_

array([ 0.00836526,  0.02445756,  0.01077467, -0.00626533,  0.00101633,
        0.00180717, -0.19363887, -0.02362559, -0.02403537, -0.09530242,
        0.00028081, -0.04207323, -0.15122199,  0.1966598 ,  0.08624484,
        0.00668579,  0.02642404,  0.00317132,  0.13539429,  0.02750804,
        0.10907332, -0.02016102])

In [22]:
regression.intercept_

0.4726694574317191

In [23]:
import numpy as np
def model_evaluation(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

Training multiple models

In [24]:
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    'DecisionTreeRegressor': DecisionTreeRegressor()
}
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(x_train,y_train)

    #Make Predictions
    y_pred=model.predict(x_test)

    mae, rmse, r2_square=model_evaluation(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)
    print("Adjusted R2 score",1 - ((1 - r2_square) * (len(y_test) - 1))/(len(y_test) - x_test.shape[1] - 1))

    r2_list.append(r2_square)
    
    print('*'*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 0.12588953584809717
MAE: 0.09256156852349201
R2 score 83.19230257893672
Adjusted R2 score 0.8319053648266138
***********************************


Lasso
Model Training Performance
RMSE: 0.3070688987953651
MAE: 0.2675589048548206
R2 score -0.00014266022936304523
Adjusted R2 score -0.00010650338649620394
***********************************


Ridge
Model Training Performance
RMSE: 0.12588955186704717
MAE: 0.09256153992786353
R2 score 83.1922983015092
Adjusted R2 score 0.8319053220478441
***********************************


Elasticnet
Model Training Performance
RMSE: 0.3070688987953651
MAE: 0.2675589048548206
R2 score -0.00014266022936304523
Adjusted R2 score -0.00010650338649620394
***********************************


DecisionTreeRegressor
Model Training Performance
RMSE: 0.11896989280142363
MAE: 0.08300274124377968
R2 score 84.98922563053097
Adjusted R2 score 0.84987648348882
***********************************


