<a href="https://colab.research.google.com/github/AyonChatterjee/ML-Projects-/blob/main/OlympicsPredictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing all the necessary libraries

In [215]:
import pandas as pd
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.preprocessing import OneHotEncoder , StandardScaler
from sklearn.linear_model import LinearRegression , Ridge , Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error , r2_score


In [216]:
#Loading the dataset

data = pd.read_csv("teams.csv")
data.shape

(2144, 11)

In [217]:
# Checking for missing values 
missing_in_column = data[data['prev_medals'].isnull()]
print(missing_in_column)


     team                           country  year  events  athletes   age  \
19    ALB                           Albania  1992       8         9  25.3   
26    ALG                           Algeria  1964       7         7  26.0   
39    AND                           Andorra  1976       2         3  28.3   
50    ANG                            Angola  1980      14        17  17.4   
59    ANT               Antigua and Barbuda  1976      11        17  23.2   
...   ...                               ...   ...     ...       ...   ...   
2092  VIN  Saint Vincent and the Grenadines  1988       6         6  20.5   
2103  YAR                       North Yemen  1984       3         3  27.7   
2105  YEM                             Yemen  1992       8         8  19.6   
2112  YMD                       South Yemen  1988       5         5  23.6   
2120  ZAM                            Zambia  1964      13        15  21.7   

      height  weight  medals  prev_medals  prev_3_medals  
19     163.0    

In [222]:
X = data.drop("medals" , axis = 1)
y = data["medals"]

In [223]:
data_cleaned = data.dropna()


In [None]:
#imputer = SimpleImputer(strategy='mean')  
#X_imputed = imputer.fit_transform(X)

ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'AFG'

In [224]:
# Train-test split (80% train , 20% test)
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size=0.2 , random_state=42)
print(f"Training set size : {X_train.shape} , Testing set size : {X_test.shape}")

Training set size : (1715, 10) , Testing set size : (429, 10)


In [225]:
# Checking which columns have missing values
missing_values =  X_train.isnull().any()
print(missing_values)


team             False
country          False
year             False
events           False
athletes         False
age              False
height           False
weight           False
prev_medals       True
prev_3_medals     True
dtype: bool


In [226]:
# Encoding for categorical features 
categorical_columns = ['team' ,  'country']
print(X_train[categorical_columns].head())
encoder = OneHotEncoder(sparse_output=False , handle_unknown='ignore')

     team   country
1970  TUN   Tunisia
163   BAR  Barbados
203   BER   Bermuda
840   HAI     Haiti
994   ITA     Italy


In [227]:
#Fit and transform on training data 
X_train_encoded = encoder.fit_transform(X_train[categorical_columns])

In [228]:
#Transforming on testing data
X_test_encoded = encoder.transform(X_test[categorical_columns])

In [229]:
# Converting the encoded features to dataframes
X_train_encoded_df = pd.DataFrame(X_train_encoded , columns=encoder.get_feature_names_out(categorical_columns))
X_test_encoded_df = pd.DataFrame(X_test_encoded , columns=encoder.get_feature_names_out(categorical_columns))



In [230]:
X_train_encoded_df.head

<bound method NDFrame.head of       team_AFG  team_AHO  team_ALB  team_ALG  team_AND  team_ANG  team_ANT  \
0          0.0       0.0       0.0       0.0       0.0       0.0       0.0   
1          0.0       0.0       0.0       0.0       0.0       0.0       0.0   
2          0.0       0.0       0.0       0.0       0.0       0.0       0.0   
3          0.0       0.0       0.0       0.0       0.0       0.0       0.0   
4          0.0       0.0       0.0       0.0       0.0       0.0       0.0   
...        ...       ...       ...       ...       ...       ...       ...   
1710       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
1711       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
1712       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
1713       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
1714       0.0       0.0       0.0       0.0       0.0       0.0       0.0   

      team_ARG  team_ARM  team_AR

In [231]:
X_train = pd.concat([X_train.reset_index(drop=True), X_train_encoded_df.reset_index(drop=True)], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), X_test_encoded_df.reset_index(drop=True)], axis=1)
X_train.drop(categorical_columns, axis=1, inplace=True)
X_test.drop(categorical_columns, axis=1, inplace=True)

In [232]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
