In [11]:
import altair as alt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Final Report

Create an electronic report in English with a maximum of 2000 words (excluding citations) using Jupyter. The report should include the posed question, conducted analysis, and derived conclusion. Only one team member needs to submit this report. It is not required to include all tasks completed by every group member in their individual assignments; tailor the final report to the collective group's work. Make sure to reach a consensus among all team members on the final content of the report. If needed, consult your TA and Instructor for further guidance.

You must submit 2 files:

- an .html file (File -> Download As -> HTML)
- an .ipynb file. This file must be fully reproducible. It must run completely from top to bottom without any additional files.

## How Many Hours Would a Player Contribute Given Their “Age”?

## Introduction

**TODO:** Write the introduction (background on the data and the question being investigated).

## Methods and Results

### Linear Regression Model

In [2]:
# Load each CSV, keep only the listed columns, then drop rows that contain
# missing values and rows that are exact duplicates.
players_df = (
    pd.read_csv('data/players.csv')
    .loc[:, ['hashedEmail', 'age', 'played_hours', 'experience']]
    .dropna()
    .drop_duplicates()
)
sessions_df = (
    pd.read_csv('data/sessions.csv')
    .loc[:, ['hashedEmail', 'start_time', 'end_time']]
    .dropna()
    .drop_duplicates()
)

players_df.head()

Unnamed: 0,hashedEmail,age,played_hours,experience
0,f6daba428a5e19a3d47574858c13550499be23603422e6...,9,30.3,Pro
1,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,17,3.8,Veteran
2,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,17,0.0,Veteran
3,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,21,0.7,Amateur
4,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,21,0.1,Regular


In [3]:
# Preview the first rows of the sessions table.
sessions_df.head()

Unnamed: 0,hashedEmail,start_time,end_time
0,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,30/06/2024 18:12,30/06/2024 18:24
1,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,17/06/2024 23:33,17/06/2024 23:46
2,f8f5477f5a2e53616ae37421b1c660b971192bd8ff77e3...,25/07/2024 17:34,25/07/2024 17:57
3,bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431...,25/07/2024 03:22,25/07/2024 03:58
4,36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f5...,25/05/2024 16:01,25/05/2024 16:12


In [4]:
# Predictor: the 'age' column, selected with a list so it stays a DataFrame.
# Target: the 'played_hours' column.
X = players_df.loc[:, ['age']]
y = players_df.loc[:, 'played_hours']

# Hold out 20% of the rows as a test split; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

X_train.head()

Unnamed: 0,age
5,17
65,21
136,20
97,18
168,17


In [5]:
# Preview the first rows of the held-out test features.
X_test.head()

Unnamed: 0,age
139,20
113,17
16,17
75,21
154,19


In [7]:
# Preprocessing for the 'age' column: fill missing values with the column
# mean, then standardize.
lr_preprocessor = ColumnTransformer([
    (
        'num',
        Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
        ]),
        ['age'],
    ),
])

# Full model: the preprocessing above followed by a linear regression.
lr_pipeline = Pipeline([
    ('preprocessor', lr_preprocessor),
    ('regressor', LinearRegression()),
])

In [8]:
# Fit the preprocessing steps and the regressor on the training split only.
lr_pipeline.fit(X_train, y_train)

In [9]:
# Predict with the fitted pipeline on the training split first, then on the
# test split.
lr_train_predictions, lr_test_predictions = (
    lr_pipeline.predict(features) for features in (X_train, X_test)
)

In [12]:
# Root mean squared error of the predictions against the observed targets,
# computed separately for each split.
lr_train_rmse = root_mean_squared_error(
    y_true=y_train,
    y_pred=lr_train_predictions,
)
lr_test_rmse = root_mean_squared_error(
    y_true=y_test,
    y_pred=lr_test_predictions,
)

In [13]:
# Report the error on both splits.
# NOTE(review): the recorded output shows a training RMSE (~31.4) far above the
# test RMSE (~7.9). Presumably a few very large played_hours values fell into
# the training split — confirm and discuss this in the report text.
print("Linear Regression Training RMSE:", lr_train_rmse)
print("Linear Regression Test RMSE:", lr_test_rmse)

Linear Regression Training RMSE: 31.43450788424238
Linear Regression Test RMSE: 7.934433438305193


In [16]:
# Pair each observed target with the model's prediction, one frame per split.
# Starting from the target Series keeps that split's original row labels.
lr_players_predictions_train = y_train.to_frame('Actual').assign(
    Predicted=lr_train_predictions
)
lr_players_predictions_test = y_test.to_frame('Actual').assign(
    Predicted=lr_test_predictions
)

# Stack the training rows above the test rows and renumber the index from 0.
lr_players_predictions_combined = pd.concat(
    [lr_players_predictions_train, lr_players_predictions_test],
    ignore_index=True,
)

lr_players_predictions_combined.head()

Unnamed: 0,Actual,Predicted
0,0.0,7.700531
1,0.1,7.034538
2,0.0,7.201036
3,0.1,7.534033
4,0.1,7.700531
