In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

ModuleNotFoundError: No module named 'lightgbm'

In [None]:
# Load the competition's sample submission file.
sample_submission_path = '../input/scrabble-player-rating/sample_submission.csv'
sub = pd.read_csv(sample_submission_path)

In [None]:
# Load the training table and lower-case its column names in one chained step.
train = (
    pd.read_csv('../input/scrabble-player-rating/train.csv')
    .rename(columns=str.lower)
)
print(f'Shape of train: {train.shape}')

In [None]:
# Load the test table and lower-case its column names in one chained step.
test = (
    pd.read_csv('../input/scrabble-player-rating/test.csv')
    .rename(columns=str.lower)
)
print(f'Shape of test: {test.shape}')

In [None]:
# Load the turns table and lower-case its column names in one chained step.
turns = (
    pd.read_csv('../input/scrabble-player-rating/turns.csv')
    .rename(columns=str.lower)
)
print(f'Shape of turns: {turns.shape}')

In [None]:
# Load the games table and lower-case its column names in one chained step.
games = (
    pd.read_csv('../input/scrabble-player-rating/games.csv')
    .rename(columns=str.lower)
)
print(f'Shape of games: {games.shape}')

In [None]:
# Nicknames that identify the bot rows; every other nickname is treated as a user.
bots = ["BetterBot", "STEEBot", "HastyBot"]

# Stack train and test rows and order them by game.
brief_df = pd.concat([train, test], axis=0).sort_values(["game_id"])

# Boolean mask computed once and reused for both halves of the split.
is_bot = brief_df["nickname"].isin(bots)

# Rows whose nickname is not a bot, with columns prefixed "user_".
user_df = brief_df[~is_bot].rename(
    columns={"nickname": "user_name", "score": "user_score", "rating": "user_rating"}
)

# Rows whose nickname is a bot, with columns prefixed "bot_".
bot_df = brief_df[is_bot].rename(
    columns={"nickname": "bot_name", "score": "bot_score", "rating": "bot_rating"}
)

# Join the user rows with the bot rows that share the same game_id.
main_df = user_df.merge(bot_df, on="game_id")
main_df.head()

In [None]:
# user_freq: number of rows in main_df that carry the same user_name as the current row.
rows_by_user = main_df.groupby("user_name")["user_name"]
main_df["user_freq"] = rows_by_user.transform("count")

# Replace the bot nickname strings with integer codes; the fitted encoder is kept
# in encode_bots so the mapping stays available.
encode_bots = LabelEncoder()
encode_bots.fit(main_df["bot_name"])
main_df["bot_name"] = encode_bots.transform(main_df["bot_name"])
main_df.head()

In [None]:
# Names of every column in main_df that contains at least one missing value.
has_missing = main_df.isnull().any()
missing_cols = main_df.columns[has_missing].tolist()
# Message fixed: "lables" was a misspelling of "labels", and the stray trailing space is removed.
print(f'These labels have missing data that needs to be cleaned: {missing_cols}')

In [None]:
# Training rows: every merged row that has a user_rating; the index is renumbered from 0.
train_df = main_df.dropna(subset=['user_rating']).reset_index(drop=True)
train_df.head()

In [None]:
# Test rows: every merged row whose user_rating is missing; the index is renumbered from 0.
rating_is_missing = main_df['user_rating'].isna()
test_df = main_df.loc[rating_is_missing].reset_index(drop=True)
test_df.head()

In [None]:
# The 30 highest single-game scores in the training data, labelled by player nickname.
# Each row of `train` is one score, so the same nickname may occur more than once here.
top_score = train.sort_values(by='score', ascending=False).head(30)
plt.figure(figsize=(10, 6))
sns.barplot(x=top_score.nickname, y=top_score.score)
# The original bare plt.xticks() only *read* the ticks; rotate the labels so that
# up to 30 nicknames do not overlap along the x-axis.
plt.xticks(rotation=90)
plt.ylabel('Scrabble Scores')
plt.xlabel('Competitor Nickname')
plt.title('Scrabble Competitors by Scores')
plt.show()

In [None]:
# Pearson correlation between the numeric columns of `games`.
# The non-numeric columns are dropped explicitly: since pandas 2.0, DataFrame.corr
# no longer discards them silently and raises when it meets a string column.
# select_dtypes is used instead of numeric_only=True so this also runs on older pandas.
numeric_games = games.select_dtypes(include=["number", "bool"])
corr = numeric_games.corr(method='pearson')
sns.heatmap(corr)

In [None]:
# Bar chart of how many games fall under each rating_mode value.
# The trailing semicolon suppresses the Axes repr in the notebook output.
games["rating_mode"].value_counts().plot(
    kind="bar",
    figsize=(8, 6),
    color=['#808080', '#C0C0C0'],
    title='Scrabble Rating Mode',
);

In [None]:
# Seed shared by every estimator that has a random component, so that the
# cross-validation comparison gives the same numbers on every run.
RANDOM_STATE = 42

# Candidate regressors, keyed by the name used in the score table.
model_dict = {
    "linear": LinearRegression(),
    "ridge": Ridge(),
    "lasso": Lasso(),
    "decision_tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "random_forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "gradient_boosting": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "neural_network": MLPRegressor(random_state=RANDOM_STATE),
    "lgb": lgb.LGBMRegressor(random_state=RANDOM_STATE),
}

In [None]:
def get_scores(model_dict, X, y, nfolds=5):
    """
    Cross-validate every model in ``model_dict`` and tabulate its mean scores.

    The data is split with an unshuffled ``KFold``; the same folds are used for
    every model. For each fold a fresh copy of the model is fitted on the
    training part and scored on the validation part with R^2, RMSE and MAE.
    The per-fold scores are then averaged per model.

    Args:
        model_dict (dict): Maps a model name to an initialized (unfitted) model
            object. The objects themselves are not fitted; copies are.
        X (pandas.DataFrame): Feature matrix; rows are selected with ``.iloc``.
        y (pandas.Series): Single target column aligned with ``X``; rows are
            selected with ``.iloc``.
        nfolds (int, optional): Number of cross-validation folds. Defaults to 5.

    Returns:
        pandas.DataFrame: One row per model with the columns ``"model"``,
        ``"(R2)"``, ``"(RMSE)"`` and ``"(MAE)"`` holding the fold-averaged scores.
    """
    # Imported locally so the function does not depend on an extra top-level import.
    from sklearn.base import clone

    df_score_details = {
        "model": [],
        "(R2)": [],
        "(RMSE)": [],
        "(MAE)": [],
    }

    # KFold without shuffling is deterministic, so the splits are computed once
    # and shared by all models instead of being rebuilt inside the loop.
    folds = list(KFold(n_splits=nfolds).split(X))

    for model_key, model_template in model_dict.items():
        val_r2_scores = []
        val_rmse_scores = []
        val_mae_scores = []
        start = time.time()

        for train_index, val_index in folds:
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]

            # Fit an unfitted copy per fold: no state can leak between folds and
            # the caller's model objects are left untouched. safe=False falls
            # back to a deep copy for objects that are not sklearn estimators.
            model = clone(model_template, safe=False)
            model.fit(X_train, y_train)

            # Flatten in case the model returns a column vector.
            val_preds = model.predict(X_val).reshape(-1)

            val_r2_scores.append(r2_score(y_val, val_preds))
            # RMSE as sqrt(MSE): the `squared=False` argument of
            # mean_squared_error is deprecated/removed in recent scikit-learn.
            val_rmse_scores.append(np.sqrt(mean_squared_error(y_val, val_preds)))
            val_mae_scores.append(mean_absolute_error(y_val, val_preds))

        mean_r2 = np.mean(val_r2_scores)
        mean_rmse = np.mean(val_rmse_scores)
        mean_mae = np.mean(val_mae_scores)
        df_score_details["model"].append(model_key)
        df_score_details["(R2)"].append(mean_r2)
        df_score_details["(RMSE)"].append(mean_rmse)
        df_score_details["(MAE)"].append(mean_mae)

        elapsed_time = time.time() - start
        print("-------------------------")
        # Report only the model that just finished rather than dumping the whole
        # accumulated score dictionary on every iteration.
        print(f"model {model_key}: R2={mean_r2:.4f}, RMSE={mean_rmse:.4f}, MAE={mean_mae:.4f}")
        print(f"{model_key} finished in {elapsed_time:.2f} seconds")
        print("-------------------------")

    df_score = pd.DataFrame(df_score_details)
    return df_score

In [None]:
# Columns kept out of the feature matrix: the user's name and the target itself.
non_feature_cols = ["user_name", "user_rating"]

X_train = train_df.drop(columns=non_feature_cols)
y_train = train_df["user_rating"].copy()
X_test = test_df.drop(columns=non_feature_cols)

# Cross-validate every candidate model with 2 folds.
df_score = get_scores(model_dict, X_train, y_train, nfolds=2)

In [None]:
# Rank the models by cross-validated RMSE, smallest (best) first.
df_score.sort_values(by="(RMSE)", ascending=True)

In [3]:
import pickle

# Fit the final model on all training rows. The seed is fixed so that the
# submitted predictions are the same on every run.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict on the test rows using the same feature columns as X_train
# (everything except the user's name and the still-empty target column).
test_df["user_rating"] = model.predict(
    test_df.drop(["user_name", "user_rating"], axis=1)
)

# Submission table: game_id plus the prediction, renamed to "rating".
final_sub = test_df[["game_id", "user_rating"]].rename(
    columns={"user_rating": "rating"}
)

# The context manager guarantees the pickle file is flushed and closed;
# the original left the file handle open.
with open('myfile.pkl', 'wb') as output:
    pickle.dump(final_sub, output)

final_sub.to_csv("submission.csv", index=False)

NameError: name 'X_train' is not defined