# Importing the required stuff

In [None]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt 

import numpy as np

import datetime as dt

: 

In [None]:
# To remove warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Encoders
from sklearn.preprocessing import OneHotEncoder     # Nominal Categorical variables
from sklearn.preprocessing import StandardScaler     # Numerical variables

from sklearn.model_selection import train_test_split

from imblearn.over_sampling import RandomOverSampler

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_curve,
    roc_auc_score,
    log_loss
)

In [None]:
# Location of the preprocessed CSV export
DATA_PATH = '/data/kaggle-preprocessed.csv'
df = pd.read_csv(DATA_PATH)

# Understanding our dataset

In [None]:
# Preview of the first five rows
df.head(n=5)

In [None]:
# Preview of the last five rows
df.tail(n=5)

In [None]:
# Column names, non-null counts and dtypes.
# `info` is a method: without the call parentheses the cell only shows the
# bound-method repr instead of the summary.
df.info()

In [None]:
# Dimensions of the frame as (rows, columns)
df.shape

# Cleaning the data

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Counting rows that repeat an earlier row in every column
df.duplicated().sum()

Basic cleaning checks are done.  
There are no missing or duplicated values, so we can proceed to the next step.

In [None]:
# Lower-casing every column name so later cells can use one spelling
df.columns = df.columns.map(str.lower)
df.columns

# Feature Engineering

### Creating Date feature

In [None]:
# Parse the date strings once and reuse the parsed values
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

# Calendar parts derived from the parsed dates
df['year'] = parsed_dates.dt.year
df['month'] = parsed_dates.dt.month

### Extracting 'hour' from 'time' column.

In [None]:
# Replace the raw time values with the hour of day they fall in
df['time'] = pd.to_datetime(df['time']).dt.hour

In [None]:
# The date column is no longer needed once year and month exist
df = df.drop(columns=['date'])

### Standardizing the units of the 'size' column

In [None]:
# Distinct two-character endings of the size strings (i.e. the unit suffixes in use)
df['size'].str.slice(start=-2).unique()

In [None]:
# Conversion factors from each recognised unit suffix to megabytes (decimal units).
# 'K' covers bare values such as '491 K'; 'TB' is included so terabyte sizes are
# not mistaken for bytes and zeroed out.
units = {'KB': 1e-3, 'MB': 1, 'GB': 1e3, 'TB': 1e6, 'B': 1e-6, 'K': 1e-3}


def convert_size(size_str):
    """Convert a human-readable file size to megabytes.

    Parameters
    ----------
    size_str : str
        Text such as '10 kB', '1.5 GB' or '491 K'. Spaces and letter case
        are ignored.

    Returns
    -------
    float or int
        The size in MB. ``0`` is returned when the input is not a string
        (for example NaN), carries no recognised unit suffix, or has a
        numeric part that cannot be parsed.
    """
    # Missing values arrive as float NaN / None; treat them like unparseable text
    if not isinstance(size_str, str):
        return 0

    # Clean the string: remove spaces, standardize case
    cleaned = size_str.strip().replace(' ', '').upper()

    # Match on the suffix only, longest first, so 'KB' wins over 'B' and 'K'
    # and a unit letter appearing elsewhere in the text is never stripped.
    for unit in sorted(units, key=len, reverse=True):
        if cleaned.endswith(unit):
            try:
                return float(cleaned[:-len(unit)]) * units[unit]
            except ValueError:
                return 0  # Numeric part is malformed
    return 0  # No recognised unit suffix

In [None]:
# Convert every 'size' entry to megabytes in a single pass.
# The frame is already fully loaded in memory, so slicing it into chunks saved
# nothing; it also assigned into `.iloc` slices (chained assignment) and
# failed on an empty frame because there was nothing to concatenate.
df['size_in_mb'] = df['size'].apply(convert_size)

# Nothing is written to disk here, so the message only reports the new column
print("Processing complete. Added the 'size_in_mb' column.")

In [None]:
# The raw size text is redundant once size_in_mb exists
df = df.drop(columns=['size'])

# EDA - Exploratory Data Analysis

In [None]:
# Seaborn colour palette shared by the plots below
palette = 'rocket'

In [None]:
# Checking the years covered by the data.
# Stored as `years` rather than `list`, so the built-in `list` type is not
# shadowed for every cell that runs afterwards.
years = df['year'].unique()
print(np.sort(years))

* > Our data contains datasets uploaded between 2016 and 2023.

In [None]:
# Number of datasets in each medal tier
ax = sns.countplot(data=df, x='medals', palette=palette)
ax.set_title('Number of medals')

* > Most datasets have a Bronze medal, followed by No medal.

In [None]:
# Medal tiers split by the weekday column
ax = sns.countplot(data=df, x='medals', hue='day', palette=palette)
ax.set_title('Medal Distribution acc. to days')

* > We can see that most of the Gold medal datasets were uploaded on Thursdays.
* > And most Silvers on Monday.

In [None]:
# Distribution of the usability scores
sns.histplot(data=df, x='usability')

* > The usability scores are negatively skewed.

In [None]:
# Mean size in MB for each medal tier
ax = sns.barplot(data=df, x='medals', y='size_in_mb', palette=palette)
ax.set_title('Medals acc to size in mb')

# Feature Selection

In [None]:
# Dropping unnecessary columns (index artefact, names, ids and links)
unused_columns = ['unnamed: 0', 'dataset_name', 'author_name', 'author_id', 'dataset_link']
df = df.drop(columns=unused_columns)

In [None]:
# Displaying the current column names
df.columns

# Pre-Processing

## Encoding & Scaling the features

In [None]:
# Object-dtype columns are treated as categorical; every other column is numerical
categorical_features = df.select_dtypes(include='object').columns
object_columns = set(categorical_features)
numerical_features = [name for name in df.columns if name not in object_columns]

In [None]:
# Encoding the ordinal feature
# Ordinal features have an order of ranking: No Medal < Bronze < Silver < Gold

# Manual mapping built from the ranking, lowest tier first
medal_order = ['No Medal', 'Bronze', 'Silver', 'Gold']
medal_rank = {medal: rank for rank, medal in enumerate(medal_order)}
df['medals'] = df['medals'].map(medal_rank)

# LabelEncoder is not used because it numbers the labels alphabetically,
# which would not follow the ranking.
# The manual mapping guarantees Gold gets the highest value and No Medal the lowest.

In [None]:
# 'medals' is already encoded, so take it out of the categorical column list
categorical_features = categorical_features.drop('medals')

In [None]:
# Encoding the categorical nominal features.
# A one-hot encoder emits one column per category, so its output cannot be
# written back into the single source column (pandas rejects an (n, k) array
# for one column). The encoded block replaces the original columns instead.
# The encoder keeps its default sparse output, which is densified below; this
# avoids the `sparse=` keyword that newer scikit-learn releases no longer accept.
encoder = OneHotEncoder()

if len(categorical_features) > 0:
    nominal_columns = list(categorical_features)
    encoded = encoder.fit_transform(df[nominal_columns])
    encoded_df = pd.DataFrame(
        encoded.toarray(),
        columns=encoder.get_feature_names_out(nominal_columns),
        index=df.index,  # keep row alignment with the rest of the frame
    )
    df = pd.concat([df.drop(columns=nominal_columns), encoded_df], axis=1)

In [None]:
# Standardising each numerical column on its own
encoder = StandardScaler()

for column_name in numerical_features:
    single_column = df[[column_name]]
    df[column_name] = encoder.fit_transform(single_column)

## Train-Test Split

In [None]:
# Feature matrix is every column except the target; the target is 'medals'
X, Y = df.drop(columns=['medals']), df['medals']

In [None]:
# Hold out 20% of the rows for testing; fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=66,
)

## Oversampling

In [None]:
# Random over-sampler with a fixed seed
ros = RandomOverSampler(random_state=60)

# Resample the training split only; the test split is left as it is
resampled = ros.fit_resample(x_train, y_train)
X_resampled, y_resampled = resampled

In [None]:
# Distinct classes in the resampled target and how many rows each one has
unique_with_counts = np.unique(y_resampled, return_counts=True)
values, count = unique_with_counts

In [None]:
print(f'Unique values of medals: {values}')
# Report the per-class row counts; the earlier version printed `values` twice
print(f'No. of samples taken from each: {count}')

# Modelling

## Initialising Model

In [None]:
# Random forest with default hyperparameters; the seed is fixed so runs are repeatable
model = RandomForestClassifier(random_state=60)

## Training the model

In [None]:
# NOTE(review): the model is fitted on the original training split. The
# oversampled X_resampled / y_resampled are not used while the next line
# stays commented out - confirm which training set is intended.
#model.fit(X_resampled, y_resampled)
model.fit(x_train, y_train)

## Predicting

In [None]:
# Predicted medal class for each row of the held-out test features
y_pred = model.predict(x_test)

## Model Evaluation

In [None]:
# Scoring functions to report, keyed by display name
metrics = dict(
    precision=precision_score,
    recall=recall_score,
)

In [None]:
# Support-weighted average of each metric across the medal classes
for metric_name, scorer in metrics.items():
    score = scorer(y_test, y_pred, average='weighted')
    print(metric_name, ' : ', score)

## Understanding the Model

In [None]:
# Dimensions of the processed frame as (rows, columns)
df.shape

In [None]:
# Data type of every column in the processed frame
df.dtypes

In [None]:
# Importance score of each input feature, taken from the fitted forest
feature_importances = model.feature_importances_

In [None]:
# Column labels of the training features, used to label the importance scores
feature_names = x_train.columns

In [None]:
# Horizontal bar per feature, drawn on an explicit Axes object
fig, ax = plt.subplots()
ax.barh(feature_names, feature_importances)
ax.set_xlabel("Feature Importance")
ax.set_title("Feature Importance for Random Forest")
plt.show()