# Data Loading and Exploration

In [132]:
import pandas as pd

# Load the raw user records; the path is relative to the notebook's working directory
train_users = pd.read_csv('../data/train_user.csv')

# Preview the first rows, then summarise column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly: wrapping it in print() would append a stray "None" line.
print(train_users.head())
train_users.info()

           id date_account_created timestamp_first_active date_first_booking  \
0  4ft3gnwmtx           2010-09-28             2009-06-09         2010-08-02   
1  bjjt8pjhuk           2011-12-05             2009-10-31         2012-09-08   
2  87mebub9p4           2010-09-14             2009-12-08         2010-02-18   
3  lsw9q7uk0j           2010-01-02             2010-01-02         2010-01-05   
4  0d01nltbrs           2010-01-03             2010-01-03         2010-01-13   

      gender  age signup_method  signup_flow language affiliate_channel  \
0     FEMALE   56         basic            3       en            direct   
1     FEMALE   42      facebook            0       en            direct   
2  -unknown-   41         basic            0       en            direct   
3     FEMALE   46         basic            0       en             other   
4     FEMALE   47         basic            0       en            direct   

  affiliate_provider first_affiliate_tracked signup_app first_device

# Data Preprocessing

In [133]:
# Columns fed to the model; a row is kept only if all of them are non-null
feature_columns = ['age', 'gender', 'signup_method', 'signup_flow']
# Other columns that could be added as features later:
# 'affiliate_channel','affiliate_provider','first_affiliate_tracked','signup_app','first_device_type','first_browser'

# Integer codes for the two categorical features
gender_codes = {'MALE': 0, 'FEMALE': 1}
signup_method_codes = {'basic': 0, 'facebook': 1, 'google': 2}

# Drop incomplete rows, then replace the category labels with their codes.
# Series.map yields NaN for any label missing from the lookup (for example the
# '-unknown-' gender visible in the preview above), so NaNs can still reach X.
# NOTE(review): this rebinds train_users, so re-running the cell on the already
# encoded frame would map the 0/1 codes to NaN — reload the CSV before re-running.
train_users = (
    train_users
    .dropna(subset=feature_columns)
    .assign(
        gender=lambda users: users['gender'].map(gender_codes),
        signup_method=lambda users: users['signup_method'].map(signup_method_codes),
    )
)

# Feature matrix (X) and prediction target (y)
X = train_users[feature_columns]
y = train_users['country_destination']

# Feature Scaling

In [134]:
from sklearn.preprocessing import StandardScaler

# Learn each column's scaling statistics from X, then apply them to X.
# The fitted scaler is kept so the same transformation can be reused on new rows.
# NOTE(review): the scaler is fitted on every row before the train/test split in
# the next section, so held-out rows influence the scaling statistics — consider
# fitting on the training split only.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Model Training

In [135]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

# Count the NaN entries that survived preprocessing in each split
train_nan_count = np.isnan(X_train).sum()
test_nan_count = np.isnan(X_test).sum()
print("Missing values in X_train:", train_nan_count)
print("Missing values in X_test:", test_nan_count)

from sklearn.impute import SimpleImputer

# Fit the imputer on the training features only, then fill their NaNs.
# strategy="mean" is SimpleImputer's default, spelled out here for the reader.
imputer = SimpleImputer(strategy="mean").fit(X_train)
X_train_imputed = imputer.transform(X_train)

# Fit a logistic-regression classifier on the imputed training features;
# max_iter is raised to 1000 and the seed is fixed for repeatability.
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_imputed, y_train)

Missing values in X_train: 9299
Missing values in X_test: 2337


# Model Evaluation

In [136]:
# Fill NaNs in the held-out features with the already-fitted imputer
# (transform only — nothing is refitted on the test rows)
X_test_imputed = imputer.transform(X_test)

# Predicted destination for every held-out row
y_pred = model.predict(X_test_imputed)

# Fraction of held-out rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.7101189140677586


In [137]:
# Example: predict the destination for one hypothetical new user.
# Values follow the training feature order and encodings:
#   age=30, gender=1 (FEMALE), signup_method=0 (basic), signup_flow=5
new_user_features = [[30, 1, 0, 5]]

# The scaler was fitted on a DataFrame, so pass a DataFrame with the same column
# names; recent scikit-learn versions warn about missing feature names when a
# bare list is given instead.
new_user_frame = pd.DataFrame(new_user_features, columns=list(X.columns))

# Apply the same preprocessing the evaluation data went through: scale, then impute.
# The imputer leaves complete rows unchanged but keeps prediction working if a
# feature is passed as NaN.
new_user_features_scaled = scaler.transform(new_user_frame)
new_user_features_imputed = imputer.transform(new_user_features_scaled)

# Predict the country destination for the new user
predicted_destination = model.predict(new_user_features_imputed)[0]

print('Predicted Country Destination:', predicted_destination)

Predicted Country Destination: US


