In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load and preprocess the dataset
data = pd.read_csv('../data/train_user.csv')
data['gender'] = data['gender'].map({'MALE': 0, 'FEMALE': 1})  # Convert gender to numerical
data['age'].fillna(data['age'].mean(), inplace=True)  # Impute missing values in age

# Convert country_destination to numerical labels (Label Encoding)
data['country_destination'] = data['country_destination'].astype('category')
data['country_code'] = data['country_destination'].cat.codes

# Split the data into features and target
features = data[['age', 'gender']]
target = data['country_code']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

from sklearn.impute import SimpleImputer

# Fill missing values in age with the mean
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Initialize and train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_imputed, y_train)

# Predict using the trained model
y_pred = clf.predict(X_test_imputed)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.71


In [16]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Load and preprocess the test data
test_data = pd.read_csv("../data/test_user.csv")

In [17]:
test_data

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,5uwns89zht,2014-07-01,20140701000006,,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
1,jtl0dijy2j,2014-07-01,20140701000051,,-unknown-,,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
2,xx0ulgorjt,2014-07-01,20140701000148,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome
3,6c6puo6ix0,2014-07-01,20140701000215,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,IE
4,czqhjk3yfe,2014-07-01,20140701000305,,-unknown-,,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62091,cv0na2lf5a,2014-09-30,20140930235232,,-unknown-,31.0,basic,0,en,direct,direct,untracked,Web,Windows Desktop,IE
62092,zp8xfonng8,2014-09-30,20140930235306,,-unknown-,,basic,23,ko,direct,direct,untracked,Android,Android Phone,-unknown-
62093,fa6260ziny,2014-09-30,20140930235408,,-unknown-,,basic,0,de,direct,direct,linked,Web,Windows Desktop,Firefox
62094,87k0fy4ugm,2014-09-30,20140930235430,,-unknown-,,basic,0,en,sem-brand,google,omg,Web,Mac Desktop,Safari


In [18]:
# Impute missing values in age with the mean
imputer = SimpleImputer(strategy='mean')
test_data['age'].fillna(test_data['age'].mean(), inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
test_data['gender'] = label_encoder.fit_transform(test_data['gender'])

# Select features for prediction
test_features = test_data[['age', 'gender']]

# Predict using the trained model
test_predictions = clf.predict(test_features)

# Convert numerical predictions back to country codes
test_data['predicted_country'] = pd.Categorical.from_codes(test_predictions, data['country_destination'].cat.categories)



In [19]:
test_data

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,5uwns89zht,2014-07-01,20140701000006,,1,35.000000,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
1,jtl0dijy2j,2014-07-01,20140701000051,,0,37.616677,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
2,xx0ulgorjt,2014-07-01,20140701000148,,0,37.616677,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome
3,6c6puo6ix0,2014-07-01,20140701000215,,0,37.616677,basic,0,en,direct,direct,linked,Web,Windows Desktop,IE
4,czqhjk3yfe,2014-07-01,20140701000305,,0,37.616677,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62091,cv0na2lf5a,2014-09-30,20140930235232,,0,31.000000,basic,0,en,direct,direct,untracked,Web,Windows Desktop,IE
62092,zp8xfonng8,2014-09-30,20140930235306,,0,37.616677,basic,23,ko,direct,direct,untracked,Android,Android Phone,-unknown-
62093,fa6260ziny,2014-09-30,20140930235408,,0,37.616677,basic,0,de,direct,direct,linked,Web,Windows Desktop,Firefox
62094,87k0fy4ugm,2014-09-30,20140930235430,,0,37.616677,basic,0,en,sem-brand,google,omg,Web,Mac Desktop,Safari


In [20]:
# Prepare submission file
submission = test_data[['id', 'predicted_country']]
submission.to_csv('../data/submission2.csv', index=False)

KeyError: "['predicted_country'] not in index"