# Task-02: Predicting Finalist Teams and players 

### Here it is assumed that Finalist teams will be Team INDIA and Team South Africa

In [87]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Here, the dataset is clean and does not require additional Data Cleaning.

# Data Preprocessing

Before we move towards modelling, we will need to process the categorical values present in the dataset, for which we will use label encoding.
<br>
We will also create a map between original and encoded values which would help in information retrieval at later stage.



In [193]:
df = pd.read_excel('players_list.xlsx')

In [194]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   date           255 non-null    object
 1   name           255 non-null    object
 2   team           255 non-null    object
 3   opposition     255 non-null    object
 4   venue          255 non-null    object
 5   player_rating  255 non-null    int64 
 6   selected       255 non-null    int64 
dtypes: int64(2), object(5)
memory usage: 14.1+ KB


Date was used to group and distinguish matches and plays no role in selection of players. Thus we will drop this column.

In [195]:
df.drop(['date'], axis=1, inplace=True)

As we have categorical attributes, we will perform label encoding. And for our reference we are making a map of encoded values to the original values

In [196]:
player_name, team_name, venue_name = df['name'], df['opposition'], df['venue']

In [197]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [198]:
encoding_list = ['name' , 'opposition' , 'venue']
df[encoding_list] = df[encoding_list].apply(lambda col: le.fit_transform(col)+1 )

In [199]:
name_code, team_code, venue_code = df['name'], df['opposition'], df['venue']

In [200]:
name_mapping = {decoded: encoded for decoded, encoded in zip(player_name, name_code)}
team_mapping = {decoded: encoded for decoded, encoded in zip(team_name, team_code)}
venue_mapping = {decoded: encoded for decoded, encoded in zip(venue_name, venue_code)}

In [201]:
print(f"Name mapping: {name_mapping}", end=' ')
print()
print(f"Team mapping: {team_mapping}", end=' ')
print()
print(f"Venue mapping: {venue_mapping}", end=' ')

Name mapping: {'Q de Kock': 19, 'T Bavuma': 28, 'HE van der Dussen': 6, 'AK Markram': 1, 'H Klaasen': 5, 'DA Miller': 3, 'M Jansen': 16, 'L Ngidi': 14, 'K Rabada': 10, 'KA Maharaj': 11, 'G Coetzee': 4, 'LB Williams': 15, 'T Shamsi': 29, 'AL Phehlukwayo': 2, 'RR Hendricks': 23, 'RG Sharma': 22, 'Ishan Kishan': 8, 'V Kohli': 30, 'SS Iyer': 26, 'KL Rahul': 12, 'HH Pandya': 7, 'JJ Bumrah': 9, 'Mohammed Siraj': 18, 'R Ashwin': 20, 'Kuldeep Yadav': 13, 'RA Jadeja': 21, 'Shubman Gill': 27, 'SN Thakur': 25, 'Mohammed Shami': 17, 'SA Yadav': 24} 
Team mapping: {'Sri Lanka': 10, 'Australia': 2, 'Afghanistan': 1, 'Pakistan': 8, 'Netherlands': 6, 'Bangladesh': 3, 'England': 4, 'New Zealand': 7, 'South Africa': 9, 'India': 5} 
Venue mapping: {'Arun Jaitley Stadium, Delhi': 1, 'MA Chidambaram Stadium, Chepauk, Chennai': 5, 'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow': 2, 'Narendra Modi Stadium, Ahmedabad': 7, 'Himachal Pradesh Cricket Association Stadium, Dharamsala': 4, '

In [202]:
df['team'] = df['team'].map(team_mapping)

In [203]:
df.sample(7)

Unnamed: 0,name,team,opposition,venue,player_rating,selected
184,5,9,7,6,725,1
129,18,5,7,4,709,1
142,15,9,3,8,144,1
39,21,5,1,1,593,1
47,6,9,2,2,730,1
38,13,5,1,1,661,1
117,2,9,4,8,359,0


## Modelling

As we've got the processed data, we would scale the data using standard scaling.

In [206]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [207]:
from sklearn.model_selection import train_test_split

In [208]:
feature_list = ['name', 'team', 'opposition', 'venue', 'player_rating']

In [209]:
X = df[feature_list]
y = df["selected"]

In [210]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=21)

In [211]:
scaler = scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled.shape, X_test_scaled.shape

((204, 5), (51, 5))

We would be using Random Forest Classifer for modelling and for its hyperparameter tuning, we would use Grid Search.

In [212]:
from sklearn.model_selection import GridSearchCV

In [246]:
from sklearn.ensemble import RandomForestClassifier

randomForest = RandomForestClassifier(random_state = 21)
parameters = {'n_estimators': list(range(50, 150, 3)), 'max_depth': list(range(1, 15, 2)), 'max_features': ['sqrt', 'log2']}

RandomForest_Classifier = GridSearchCV(randomForest, parameters, n_jobs=20, verbose=0)
RandomForest_Classifier.fit(X_train_scaled, y_train)

In [247]:
best_model = RandomForest_Classifier.best_estimator_
best_param_RF = RandomForest_Classifier.best_params_
print("Best parameters are " , best_param_RF)

Best parameters are  {'max_depth': 9, 'max_features': 'sqrt', 'n_estimators': 65}


In [248]:
y_train_predicted = best_model.predict(X_train_scaled)
y_predicted = best_model.predict(X_test_scaled)

Predicting the accuracy

In [250]:
from sklearn.metrics import accuracy_score

In [251]:
accuracy_score(y_predicted, y_test)

0.803921568627451

Predicting Teams for Finale!

Here, we are creating the test set for the final where we are selecting the 15 indian player squad and 15 south african player squad and removing all the duplicate entries

In [252]:
# India = 5, South Africa = 9
Indian_squad = df[df['team'] == 5][['name', 'player_rating']].drop_duplicates().reset_index(drop=True)
SA_squad = df[df['team'] == 9][['name', 'player_rating']].drop_duplicates().reset_index(drop=True)


In [253]:
Indian_squad

Unnamed: 0,name,player_rating
0,22,739
1,8,565
2,30,770
3,26,643
4,12,623
5,7,465
6,9,654
7,18,709
8,20,310
9,13,661


In [254]:
# Narendra Modi Stadium, Ahmedabad = 7
Indian_squad[['team', 'opposition', 'venue']] = [5, 9, 7]
SA_squad[['team', 'opposition', 'venue']] = [9, 5, 7]


In [255]:
Indian_squad

Unnamed: 0,name,player_rating,team,opposition,venue
0,22,739,5,9,7
1,8,565,5,9,7
2,30,770,5,9,7
3,26,643,5,9,7
4,12,623,5,9,7
5,7,465,5,9,7
6,9,654,5,9,7
7,18,709,5,9,7
8,20,310,5,9,7
9,13,661,5,9,7


In [256]:
india = Indian_squad[['name', 'team', 'opposition', 'venue', 'player_rating']]
south_africa = SA_squad[['name', 'team', 'opposition', 'venue', 'player_rating']]

In [257]:
india

Unnamed: 0,name,team,opposition,venue,player_rating
0,22,5,9,7,739
1,8,5,9,7,565
2,30,5,9,7,770
3,26,5,9,7,643
4,12,5,9,7,623
5,7,5,9,7,465
6,9,5,9,7,654
7,18,5,9,7,709
8,20,5,9,7,310
9,13,5,9,7,661


In [258]:
south_africa

Unnamed: 0,name,team,opposition,venue,player_rating
0,19,9,5,7,771
1,28,9,5,7,624
2,6,9,5,7,730
3,1,9,5,7,646
4,5,9,5,7,725
5,3,9,5,7,649
6,16,9,5,7,563
7,14,9,5,7,570
8,10,9,5,7,629
9,11,9,5,7,694


In [259]:
scaler = StandardScaler()
scaler.fit(india)
scaler.fit(south_africa)
india_scaled = scaler.transform(india)
SA_scaled = scaler.transform(south_africa)

In [262]:
india_pred = best_model.predict(india_scaled)

In [263]:
india_pred

array([1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1], dtype=int64)

In [266]:
south_africa_pred = best_model.predict(SA_scaled)

In [267]:
south_africa_pred

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0], dtype=int64)

In [268]:
india['selected'] = india_pred.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  india['selected'] = india_pred.tolist()


In [269]:

for nameEnc in india['name']:
  for names,value in name_mapping.items():
    if value==nameEnc:
      india['name'].replace([nameEnc], str(names),inplace=True)
      
team_india = india[india['selected']==1]['name']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  india['name'].replace([nameEnc], str(names),inplace=True)


In [270]:
south_africa['selected'] = south_africa_pred.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  south_africa['selected'] = south_africa_pred.tolist()


In [271]:
count=0
for nameEnc in south_africa['name']:
  for names,value in name_mapping.items():
    if value==nameEnc:
      south_africa['name'].replace([nameEnc], str(names),inplace=True)
team_southafrica = south_africa[south_africa['selected']==1]['name']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  south_africa['name'].replace([nameEnc], str(names),inplace=True)


# The playing eleven of each team for the Finale.

In [272]:
print(f"Playing eleven (India): \n{team_india}")

Playing eleven (India): 
0          RG Sharma
2            V Kohli
3            SS Iyer
4           KL Rahul
6          JJ Bumrah
7     Mohammed Siraj
9      Kuldeep Yadav
10         RA Jadeja
11      Shubman Gill
13    Mohammed Shami
14          SA Yadav
Name: name, dtype: object


In [273]:
print(f"Playing eleven (South Africa):\n{team_southafrica}")

Playing eleven (South Africa):
0             Q de Kock
1              T Bavuma
2     HE van der Dussen
3            AK Markram
4             H Klaasen
5             DA Miller
6              M Jansen
8              K Rabada
9            KA Maharaj
10            G Coetzee
12             T Shamsi
Name: name, dtype: object
