In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
train_data = "/kaggle/input/projetdm-data/TRAIN.CSV"

# First pass over the file: record how many comma-separated fields each line
# holds (number of commas + 1), because the rows are not all the same length.
with open(train_data, 'r') as temp_f:
    col_count = [line.count(",") + 1 for line in temp_f]

# Second pass: name the columns 0 .. (widest row - 1) so every line fits in the
# frame, then convert every cell to text.
column_names = list(range(max(col_count)))
df_train = pd.read_csv(
    train_data,
    header=None,
    delimiter=",",
    names=column_names,
    low_memory=False,
).astype(str)


In [None]:
import re
# Characters matched against the LAST character of a 'hotkey...' token by
# extract_features; one counter is kept per entry, in this order.
actions_to_count = list("012")
# Characters matched against the SECOND-TO-LAST character of a 'hotkey...' token
# by extract_features; one counter is kept per entry, in this order.
hotkeys_to_count = [str(digit) for digit in range(10)]

In [None]:
def extract_features(input_data):
    """Turn rows of raw action tokens into rows of hotkey counts.

    Args:
        input_data: iterable of indexable, sliceable rows. The first two cells
            of each row are copied to the output unchanged; every remaining
            cell is treated as one action token.

    Returns:
        A list with one list per input row, laid out as
        ``[row[0], row[1], *action_counts, *hotkey_counts]`` where

        * ``action_counts[i]`` is the number of tokens that start with
          ``'hotkey'`` and end with ``actions_to_count[i]``;
        * ``hotkey_counts[i]`` is the number of tokens that start with
          ``'hotkey'`` and whose second-to-last character equals
          ``hotkeys_to_count[i]``.

    Tokens that are not strings (for example NaN/None padding in ragged rows)
    or that do not start with ``'hotkey'`` are ignored.

    Relies on the module-level lists ``actions_to_count`` and
    ``hotkeys_to_count``.
    """
    output_data = []
    for row in input_data:
        count_actions = [0] * len(actions_to_count)
        count_hotkeys = [0] * len(hotkeys_to_count)

        for action in row[2:]:
            # Decide once per token whether it is a hotkey event at all,
            # instead of re-testing the prefix for every candidate suffix.
            # Non-string cells cannot be hotkey events and are skipped rather
            # than crashing on a missing .startswith().
            if not isinstance(action, str) or not action.startswith('hotkey'):
                continue

            # Last character of the token selects the action counter.
            for index, substring in enumerate(actions_to_count):
                if action.endswith(substring):
                    count_actions[index] += 1

            # Second-to-last character selects the hotkey counter. Indexing is
            # safe: the token starts with 'hotkey', so it has >= 6 characters.
            for index, substring in enumerate(hotkeys_to_count):
                if action[-2] == substring:
                    count_hotkeys[index] += 1

        output_data.append([row[0], row[1], *count_actions, *count_hotkeys])
    return output_data
    

In [None]:
# Hand the frame's underlying 2-D array (one array row per DataFrame row) to
# the feature extractor.
converted_train_data = df_train.to_numpy()
output_train_data = extract_features(input_data=converted_train_data)

In [None]:
# Column labels for the extracted rows, in the order extract_features emits
# them: the two pass-through cells, the three action counters, then one
# counter per hotkey digit 0-9.
headers = (
    ['url', 'race']
    + ['number_hotkey_created', 'number_hotkey_update', 'number_hotkey_used']
    + [f'number_key{digit}' for digit in range(10)]
)

# Wrap the extracted rows in a labelled DataFrame and preview the first rows.
new_df = pd.DataFrame(data=output_train_data, columns=headers)
new_df.head()

In [None]:
# One-hot encode the 'race' column.
# The frame is rebuilt from output_train_data instead of encoding new_df in
# place: once 'race' has been replaced by its dummy columns, encoding the same
# frame again would raise a KeyError, so the original form could not be re-run.
new_df = pd.get_dummies(
    pd.DataFrame(output_train_data, columns=headers),
    columns=['race'],
)

# Preview the encoded frame (rich display of the first rows rather than a
# plain-text dump of the whole DataFrame).
new_df.head()


In [None]:
# Train / validation split
from sklearn.model_selection import train_test_split

# Column 0 of new_df is the label to predict; every later column is a feature.
y = new_df.iloc[:, 0]
X = new_df.iloc[:, 1:]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical from run to run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Random Forest model training
from sklearn.ensemble import RandomForestClassifier

# n_estimators is the number of trees in the forest. random_state pins the
# forest's internal randomness so the validation accuracy and the final
# predictions are the same on every Restart & Run All.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training split
model.fit(X_train, y_train)

# Predict the held-out validation rows
predictions = model.predict(X_valid)

# Accuracy = share of validation rows whose predicted label equals the true
# label. Computed from the predictions above so the forest is not evaluated a
# second time.
accuracy = (y_valid == predictions).mean()
print("Accuracy:", accuracy)


In [None]:
# Test data preparation

test_data = "/kaggle/input/projetdm-data/TEST.CSV"

# First pass over the file: record how many comma-separated fields each line
# holds (number of commas + 1), because the rows are not all the same length.
with open(test_data, 'r') as temp_f:
    col_count = [line.count(",") + 1 for line in temp_f]

# Second pass: name the columns 0 .. (widest row - 1) so every line fits in the
# frame, then convert every cell to text.
column_names = list(range(max(col_count)))
df_test = pd.read_csv(
    test_data,
    header=None,
    delimiter=",",
    names=column_names,
    low_memory=False,
).astype(str)

# Prepend a constant placeholder column so each row starts with two leading
# cells before the action tokens, which is the layout extract_features reads
# (row[0], row[1], then tokens).
# NOTE(review): presumably TEST.CSV lacks the leading label column that
# TRAIN.CSV has; confirm against the data.
df_test.insert(loc=0, column='0', value=0)
df_test.head()

In [None]:
# Hand the test frame's underlying 2-D array (one array row per DataFrame row)
# to the feature extractor.
converted_test_data = df_test.to_numpy()
output_test_data = extract_features(input_data=converted_test_data)


In [None]:
# Build the test frame with the same labelling and one-hot encoding steps that
# were applied to the training rows.
new_test_df = pd.get_dummies(
    pd.DataFrame(output_test_data, columns=headers),
    columns=['race'],
)

# get_dummies only creates a column for each 'race' value that actually occurs
# in the frame it is given, so the test frame can end up with fewer, more, or
# differently ordered dummy columns than the training frame. Align the test
# features to the exact columns the model was fitted on: missing dummies are
# added as all-zero columns and columns unknown to the model are dropped.
X_test = new_test_df.iloc[:, 1:].reindex(columns=X_train.columns, fill_value=0)

predictions = model.predict(X_test)
print(predictions)

In [None]:
# Put the predictions in a one-column DataFrame
submission_df = pd.DataFrame({'prediction': predictions})

# Add 'RowID' as the first column; IDs are the positional index shifted by one,
# so they start at 1
submission_df.insert(0, 'RowID', submission_df.index + 1)

# Write the two columns to 'submissions.csv' without the DataFrame index
submission_df.to_csv('submissions.csv', index=False)