In [12]:
# Switch the working directory so relative paths used later (e.g. 'data.csv') resolve inside it.
# NOTE(review): hard-coded absolute path; looks like a Google Drive mount in Colab — confirm,
# and make sure the drive is mounted before this cell runs.
%cd '/content/drive/MyDrive/Codeforce-ratings-prediction'

/content/drive/MyDrive/Codeforce-ratings-prediction


# Importing Libraries

In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff
# Pin DataFrame.plot() to matplotlib. This is pandas' default backend, so the line
# presumably only guards against another backend set earlier in the session — confirm.
pd.options.plotting.backend = "matplotlib"
import category_encoders as ce
from statsmodels.stats.outliers_influence import variance_inflation_factor

import textwrap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Dataset Loaded

In [14]:
# Read the ratings table from the current working directory into `df`,
# then preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='data.csv')
df.head(n=5)

Unnamed: 0,userid,rating,contest1,contest2,contest3,contest4,contest5,contest6,contest7,contest8,contest9,contest10
0,3143927301,2115,2078,2055,2115,2047.0,2024.0,2010.0,1953.0,1936.0,2042.0,2045.0
1,1876577621,2254,2194,2114,2152,2179.0,2211.0,2154.0,2170.0,2141.0,2157.0,2209.0
2,6397741793,2344,2120,2206,2147,2234.0,2294.0,2090.0,2089.0,2072.0,2085.0,2114.0
3,3090123616,2224,2224,2222,2166,2116.0,2029.0,2113.0,2104.0,2096.0,2115.0,2163.0
4,9564162806,2128,2128,2120,2072,2018.0,1963.0,2039.0,1932.0,1963.0,1960.0,1886.0


# Basic Data Exploration

In [15]:
# (rows, columns) of the loaded frame.
df.shape

(1423, 12)

In [16]:
# Storage dtype of each column.
df.dtypes

userid         int64
rating         int64
contest1       int64
contest2       int64
contest3       int64
contest4     float64
contest5     float64
contest6     float64
contest7     float64
contest8     float64
contest9     float64
contest10    float64
dtype: object

In [17]:
# Count the missing entries in each column.
df.isna().sum()

userid         0
rating         0
contest1       0
contest2       0
contest3       0
contest4       1
contest5      19
contest6      49
contest7      80
contest8     120
contest9     144
contest10    171
dtype: int64

***There are many missing values, especially in contest7 through contest10 (80–171 of the 1,423 rows each).***

In [19]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

0

***There are no duplicated rows.***

In [21]:
# Remove the 'userid' column from the frame.
# Rebinding `df` with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError for the
# already-removed column.
df = df.drop(columns=['userid'], errors='ignore')

In [23]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,rating,contest1,contest2,contest3,contest4,contest5,contest6,contest7,contest8,contest9,contest10
count,1423.0,1423.0,1423.0,1423.0,1422.0,1404.0,1374.0,1343.0,1303.0,1279.0,1252.0
mean,2291.403373,2225.56922,2182.663387,2144.55376,2096.829114,2062.668803,2044.155022,2032.603127,2032.438987,2018.510555,2006.584665
std,258.730165,228.588617,247.831336,277.342857,324.225868,358.430332,375.658912,388.340663,374.310185,385.680233,397.97175
min,1054.0,1029.0,1001.0,838.0,683.0,570.0,427.0,439.0,388.0,453.0,354.0
25%,2121.0,2077.0,2022.5,1984.0,1940.25,1913.75,1894.25,1884.5,1877.5,1855.0,1843.75
50%,2224.0,2155.0,2121.0,2103.0,2080.0,2069.5,2045.5,2044.0,2041.0,2033.0,2014.0
75%,2396.5,2314.0,2288.0,2278.5,2248.0,2246.25,2231.75,2221.5,2213.0,2201.5,2205.25
max,3833.0,3572.0,3697.0,3569.0,3487.0,3644.0,3833.0,3724.0,3727.0,3783.0,3813.0


# Data Visualization

In [27]:
# Histogram grid: one panel per column of `df`, filled row by row.
# Both the subplot titles and the traces are derived from the same column list,
# so a title can never sit above another column's data (which happened with the
# hard-coded names whenever the frame's columns differed from that list).
# The 3x4 grid holds at most 12 columns; plotly raises if there are more.
GRID_ROWS, GRID_COLS = 3, 4
hist_columns = list(df.columns)

fig = make_subplots(rows=GRID_ROWS, cols=GRID_COLS, subplot_titles=hist_columns)

for position, column in enumerate(hist_columns):
    grid_row, grid_col = divmod(position, GRID_COLS)
    fig.add_trace(
        # histnorm makes the bars a probability density, matching the y-axis
        # label below; the default would plot raw counts.
        go.Histogram(x=df[column], name=column, histnorm='probability density'),
        row=grid_row + 1,
        col=grid_col + 1,
    )

# Update layout: one shared x label (bottom row) and one shared y label
# (middle row, first column) instead of repeating them on every panel.
fig.update_layout(title='Individual Displots of Eleven Columns', showlegend=False)
fig.update_xaxes(title_text='Value', row=3, col=2)
fig.update_yaxes(title_text='Probability Density', row=2, col=1)

# Show plot
fig.show()


In [28]:
# Box-plot grid: one horizontal box plot per column of `df`, filled row by row.
# Titles and traces come from the same column list, so they stay aligned even
# if the frame's columns change (the hard-coded names could drift from
# subplot_titles=df.columns).
# The 3x4 grid holds at most 12 columns; plotly raises if there are more.
GRID_ROWS, GRID_COLS = 3, 4
box_columns = list(df.columns)

fig = make_subplots(rows=GRID_ROWS, cols=GRID_COLS, subplot_titles=box_columns)

for position, column in enumerate(box_columns):
    grid_row, grid_col = divmod(position, GRID_COLS)
    fig.add_trace(
        go.Box(x=df[column], name=column),
        row=grid_row + 1,
        col=grid_col + 1,
    )

# Update layout
fig.update_layout(title='Individual Box Plots of Eleven Columns', showlegend=False)

# Rotate y-axis tick labels (the trace names) to run vertically
fig.update_yaxes(tickangle=90)

# Show plot
fig.show()


In [38]:
# Pairwise scatter plots of every column in `df` against every other column.
fig = px.scatter_matrix(df, title='Pairwise Scatter Matrix')
# Hide the diagonal panels (each column plotted against itself); the original
# issued this call twice, once is sufficient.
fig.update_traces(diagonal_visible=False)
fig.update_layout(height=900)
fig.show()