In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load and preprocess the dataset
data = pd.read_csv('../data/train_user.csv')

# Convert gender to numerical codes. Any value other than 'MALE'/'FEMALE'
# becomes NaN here and is filled later by the SimpleImputer together with age.
data['gender'] = data['gender'].map({'MALE': 0, 'FEMALE': 1})

# NOTE: missing ages are intentionally NOT filled at this point.
#  * `data['age'].fillna(..., inplace=True)` is a chained in-place update: it
#    raises a FutureWarning on pandas 2.x and silently does nothing once
#    Copy-on-Write is enabled, so its effect depended on the pandas version.
#  * Filling with the mean of the full dataset before train_test_split leaks
#    information from the held-out rows into the training features.
# The SimpleImputer that is fitted on the training split further down in this
# cell performs the mean imputation instead.

# Convert country_destination to numerical labels (Label Encoding).
# The categorical column is kept so the codes can be mapped back to labels.
data['country_destination'] = data['country_destination'].astype('category')
data['country_code'] = data['country_destination'].cat.codes

from sklearn.impute import SimpleImputer

# Model inputs (age, gender) and the encoded destination used as the label
features = data.loc[:, ['age', 'gender']]
target = data['country_code']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Learn the column means on the training split only, then apply them to both
# splits so the held-out rows never influence the imputation values
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Build a 100-tree random forest and fit it on the imputed training features
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_imputed, y_train
)

# Score the model on the held-out split
y_pred = clf.predict(X_test_imputed)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy))

Accuracy: 0.71


In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Load the users that need a predicted destination
test_data = pd.read_csv("../data/test_user.csv")

# Encode gender with the SAME mapping that was applied to the training data.
# Fitting a fresh LabelEncoder on the test set assigns codes alphabetically
# over whatever values occur there, which does not agree with the training
# codes (MALE -> 0, FEMALE -> 1, anything else -> NaN) and would feed the
# model mis-encoded features.
test_data['gender'] = test_data['gender'].map({'MALE': 0, 'FEMALE': 1})

# Select features for prediction (same columns, same order as in training)
test_features = test_data[['age', 'gender']]

# Fill missing values with the means learned from the training split.
# `imputer` is the SimpleImputer fitted in the training cell; it is reused
# as-is rather than being re-created (which would discard the fitted state)
# or replaced by the test-set mean (which is a different transformation from
# the one the model was trained with).
test_features_imputed = imputer.transform(test_features)

# Predict using the trained model, on the same array format it was fitted on
test_predictions = clf.predict(test_features_imputed)

# Convert numerical predictions back to country labels using the category
# order established when the training target was encoded
test_data['predicted_country'] = pd.Categorical.from_codes(
    test_predictions,
    categories=data['country_destination'].cat.categories,
)

# Prepare submission file
submission = test_data[['id', 'predicted_country']]
submission.to_csv('submission.csv', index=False)

