In [215]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Step 1: Identifying the problem**

- <b>Objective: </b>In this challenge, we are given a list of users along with information about their activity on the website, such as the date the account was created, the time the user was first active on the website, and the country of the user's first booking. We also have some personal information about each user. Our task is to build a machine learning model that predicts the country of a new user's first booking destination.


- <b>Data: </b>Following are the features present in training dataset:
    - id: user id
    - date_account_created: the date of account creation
    - timestamp_first_active: timestamp of the first activity, note that it can be earlier than date_account_created or date_first_booking because a user can search before signing up
    - date_first_booking: date of first booking
    - gender
    - age
    - signup_method: whether the user signed up through the website or via Facebook, Gmail, etc.
    - signup_flow: the page the user came from to sign up
    - language: international language preference
    - affiliate_channel: what kind of paid marketing
    - affiliate_provider: where the marketing is e.g. google, craigslist, other
    - first_affiliate_tracked: the first marketing the user interacted with before signing up
    - signup_app
    - first_device_type
    - first_browser
    - country_destination: this is the target variable. There are 12 possible outcomes of the destination country: 'US', 'FR', 'CA', 'GB', 'ES', 'IT', 'PT', 'NL','DE', 'AU', 'NDF' (no destination found), and 'other'. 
    <br>Note: 
        - 'other' means there was a booking, but is to a country not included in the list
        - 'NDF' means no destination was found, i.e. the user made no booking.


- There are 3 other files given along with the train and test datasets:
    1. sessions.csv - this file contains the web session logs for each user
    2. countries.csv - summary statistics of the destination countries in this dataset and their locations
    3. age_gender_bkts.csv - summary statistics of users' age group, gender, and country of destination

Reading the data given

In [216]:
# Read the four competition files (pandas decompresses the .zip archives itself).
train, test, countries, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip",
        "/kaggle/input/airbnb-recruiting-new-user-bookings/test_users.csv.zip",
        '/kaggle/input/airbnb-recruiting-new-user-bookings/countries.csv.zip',
        '/kaggle/input/airbnb-recruiting-new-user-bookings/sample_submission_NDF.csv.zip',
    )
)

**Step 2: Data Exploration**

In [217]:
# A notebook cell only renders its last bare expression, so the original
# `test.head()` was computed and silently discarded. Display both previews
# explicitly (`display` is provided by the IPython/Jupyter kernel).
display(test.head())
display(train.head())

In [218]:
train.describe()

In [219]:
train.isna().sum()

In [220]:
train.info()

In [221]:
countries.describe()

In [222]:
countries.head()

In [223]:
sample.describe()

In [224]:
sample.info()

In [225]:
# Class balance of the target. Uses Series.value_counts because the
# top-level pd.value_counts helper is deprecated in recent pandas.
print(train['country_destination'].value_counts())

In [226]:
# Frequency of each gender label. Uses Series.value_counts because the
# top-level pd.value_counts helper is deprecated in recent pandas.
print(train['gender'].value_counts())

In [227]:
# Frequency of each language code. Uses Series.value_counts because the
# top-level pd.value_counts helper is deprecated in recent pandas.
print(train['language'].value_counts())

In [228]:

# Drop fully duplicated rows in place and report how many rows disappeared.
before_dup = len(train)
train.drop_duplicates(inplace=True)
after_dup = len(train)
print(f"{before_dup - after_dup} duplicates removed")

**some data cleaning**

In [229]:
# Parse the date-like columns into pandas datetimes. The two plain date
# columns use pandas' default parsing; the first-active timestamp is stored
# as a compact YYYYMMDDHHMMSS value and needs an explicit format.
for date_col in ("date_account_created", "date_first_booking"):
    train[date_col] = pd.to_datetime(train[date_col])
train["timestamp_first_active"] = pd.to_datetime(
    train["timestamp_first_active"],
    format="%Y%m%d%H%M%S",
)

# Derived calendar features: month of the first booking and year of account creation.
train['month_FB'] = train['date_first_booking'].dt.month
train['Year_Created'] = train['date_account_created'].dt.year

train.head()

In [230]:
# Distribution of the raw (uncleaned) age values, missing ages excluded.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement recommended by seaborn.
plt.figure(figsize=(12, 6))
sns.histplot(train['age'].dropna(), kde=True, stat='density')

In [231]:
# Some rows hold a year of birth (>= 1920) in the age column instead of an age.
# Convert those to an age relative to the year the account was created.
# Done as one vectorised .loc assignment: the original row-by-row loop used
# chained indexing (train['age'][i] = ...), which is slow, raises
# SettingWithCopyWarning, and is not guaranteed to write back to `train`.
# Missing ages compare False against 1920 and are left untouched.
birth_year_mask = train['age'] >= 1920
train.loc[birth_year_mask, 'age'] = (
    train.loc[birth_year_mask, 'Year_Created'] - train.loc[birth_year_mask, 'age']
)
    

In [232]:
train.describe()

In [233]:
# Histogram / density estimate of age restricted to age <= 100.
plt.figure(figsize=(12, 6))
# Boolean mask with a single .loc call instead of apply(lambda) plus chained
# indexing; missing ages compare False and are excluded, as before.
age_hundred = train.loc[train['age'] <= 100, 'age']
# sns.distplot is deprecated; histplot with a KDE overlay is its replacement.
sns.histplot(age_hundred, kde=True, stat='density')
sns.despine()

In [234]:
ax = sns.boxplot(x=train["age"])

In [235]:
# Remove age outliers lying more than 3 standard deviations from the mean.
plt.figure(figsize=(12, 6))
age_mean = train['age'].mean()
age_std = train['age'].std()
within_three_sigma = (train['age'] - age_mean).abs() <= 3 * age_std
# A comparison against NaN is False, so the original filter silently threw away
# every row with a missing age, leaving nothing for the later mean-imputation
# step to fill. Keep those rows explicitly; .copy() makes `train` an
# independent frame so later column assignments do not warn.
train = train[within_three_sigma | train['age'].isna()].copy()
ax = sns.boxplot(x=train["age"])

In [236]:
# Keep only ages strictly between 5 and 70, then fill missing ages with the mean.
plt.figure(figsize=(12, 6))
implausible_age = (train['age'] >= 70) | (train['age'] <= 5)
# Filter with a mask and take an explicit copy instead of drop(inplace=True):
# `train` was itself produced by boolean filtering, so assigning the 'age'
# column on it afterwards raised SettingWithCopyWarning. Rows with a missing
# age compare False on both conditions and are kept for imputation below.
train = train.loc[~implausible_age].copy()
Mean_age = train['age'].mean()
train['age'] = train['age'].fillna(Mean_age)
ax = sns.boxplot(x=train["age"])

**Data visualization**

See whether gender is related to the signup method used.

In [237]:
# Bar chart of how often each gender label occurs (missing values included).
fig, ax = plt.subplots(figsize=(12, 6))
gender_counts = train['gender'].value_counts(dropna=False)
gender_counts.plot(kind='bar', ax=ax)

In [238]:
sns.catplot(x='signup_method', col='gender', kind='count', data=train);

In [239]:
# Bar chart of the number of users per destination country.
fig, ax = plt.subplots(figsize=(12, 6))
destination_counts = train['country_destination'].value_counts()
destination_counts.plot(kind='bar', ax=ax)
ax.set_xlabel('countries sums')
ax.set_ylabel('Frequency')

In [240]:

# Bar chart of first bookings per calendar month.
fig, ax = plt.subplots(figsize=(12, 6))
bookings_per_month = train['month_FB'].value_counts()
bookings_per_month.plot(kind='bar', ax=ax)
ax.set_title('frequencies of reservations / number of month')
ax.set_xlabel(' number of month')
ax.set_ylabel('Frequency')
plt.show()

In [241]:
# Bar chart of the number of users per language preference.
fig, ax = plt.subplots(figsize=(12, 6))
language_counts = train['language'].value_counts()
language_counts.plot(kind='bar', ax=ax)
ax.set_title('frequencies of reservations / languages')
ax.set_xlabel('langue')
ax.set_ylabel('Frequency')

In [242]:
# Users per destination country, split by gender.
# NOTE(review): this figure is 18x19 while the sibling plots use 18x9 — confirm the height is intended.
fig, ax = plt.subplots(figsize=(18, 19))
sns.countplot(data=train, x='country_destination', hue='gender', ax=ax)
ax.set_title('# of Travellers to countries and their genders')
plt.show()

In [243]:
# Users per destination country, split by signup method.
fig, ax = plt.subplots(figsize=(18, 9))
sns.countplot(data=train, x='country_destination', hue='signup_method', ax=ax)
ax.set_title('# of Travellers to countries and their signup method')
plt.show()

In [244]:
# Users per destination country, split by the app used to sign up.
fig, ax = plt.subplots(figsize=(18, 9))
sns.countplot(data=train, x='country_destination', hue='signup_app', ax=ax)
ax.set_ylabel('Number of users')
ax.set_title('Signup_App vs. country destination')
ax.legend(loc='upper right')
plt.show()

In [245]:
# Users per destination country, split by the first device type seen.
fig, ax = plt.subplots(figsize=(18, 9))
sns.countplot(data=train, x='country_destination', hue='first_device_type', ax=ax)
ax.set_ylabel('Number of users')
ax.set_title('first device type vs. country destination')
ax.legend(loc='upper right')
plt.show()

In [246]:
# Users per destination country, split by affiliate (marketing) channel.
fig, ax = plt.subplots(figsize=(18, 9))
sns.countplot(data=train, x='country_destination', hue='affiliate_channel', ax=ax)
ax.set_ylabel('Number of users')
ax.set_title('Affiliate_Channel vs. country destination')
ax.legend(loc='upper right')
plt.show()

In [247]:
# Users per destination country, split by month of first booking.
fig, ax = plt.subplots(figsize=(18, 9))
sns.countplot(data=train, x='country_destination', hue='month_FB', ax=ax)
ax.set_title('# of Travellers to each country by month')
plt.show()

In [248]:
# Users per first tracked affiliate, split by destination country.
fig, ax = plt.subplots(figsize=(18, 9))
sns.countplot(data=train, x='first_affiliate_tracked', hue='country_destination', ax=ax)
ax.set_title('# of first affiliate tracked by countries')
plt.show()

In [249]:
# Signup-method breakdown restricted to users whose destination is the US.
us_rows = train[train['country_destination'] == 'US']
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=us_rows, x='country_destination', hue='signup_method', ax=ax)
ax.set_title('# of Travellers to USA')
plt.show()

In [250]:
# Gender breakdown restricted to users whose destination is the US.
us_rows = train[train['country_destination'] == 'US']
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=us_rows, x='country_destination', hue='gender', ax=ax)
ax.set_title('# of Travellers to USA')
plt.show()

In [251]:
# Language breakdown restricted to users whose destination is the US.
us_rows = train[train['country_destination'] == 'US']
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=us_rows, x='country_destination', hue='language', ax=ax)
ax.set_title('# of Travellers to USA')
plt.show()

In [252]:
# Signup-method breakdown restricted to users with no booking ('NDF').
plt.figure(figsize=(12, 6))
sns.countplot(x='country_destination', hue='signup_method', data=train[train['country_destination']=='NDF'])
# The title was copy-pasted from the US plots; this figure shows the NDF subset.
plt.title('# of Travellers to NDF')
plt.show()

In [253]:
# Gender breakdown restricted to users with no booking ('NDF').
ndf_rows = train[train['country_destination'] == 'NDF']
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=ndf_rows, x='country_destination', hue='gender', ax=ax)
ax.set_title('# of Travellers to NDF')
plt.show()

In [254]:
# Language breakdown restricted to users with no booking ('NDF').
ndf_rows = train[train['country_destination'] == 'NDF']
fig, ax = plt.subplots(figsize=(18, 9))
sns.countplot(data=ndf_rows, x='country_destination', hue='language', ax=ax)
ax.set_title('# of Travellers to NDF')
plt.show()

**Preprocessing**

In [255]:
# Replace the gender labels with integer codes. An unexpected label raises
# KeyError because the lookup goes through plain dict indexing.
# Only `train` is encoded here; the test frame is left as loaded.
sex = {'MALE': 0,'FEMALE': 1, 'OTHER': 2, '-unknown-':3}
train['gender'] = train['gender'].map(lambda label: sex[label])


In [256]:
# Remove the identifier, the raw date columns and the categorical columns
# that are not used as model inputs, then print the remaining frame.
unused_columns = [
    'id',
    'date_first_booking',
    'date_account_created',
    'timestamp_first_active',
    'signup_app',
    'first_browser',
    'first_device_type',
    'first_affiliate_tracked',
    'signup_flow',
    'affiliate_provider',
]
train.drop(columns=unused_columns, inplace=True)
print(train)

**the features that will help the model**

In [257]:
# Columns used as model inputs.
# NOTE(review): 'month_FB' is derived from date_first_booking. Users with no
# booking ('NDF') presumably have no first-booking date, and that date would
# not be known for a new user at prediction time, so this feature likely
# leaks the target — TODO confirm and consider dropping it.
Features = ['gender', 'language','signup_method','affiliate_channel','month_FB']

In [258]:
from sklearn.preprocessing import LabelEncoder
# NOTE(review): `encoder` is not referenced by any later cell visible in this
# notebook; the encoding cell that follows creates its own LabelEncoder (`le`).
encoder=LabelEncoder()

Use a label encoder to transform string data into numerical values.

In [259]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every feature column. The original refit one shared
# LabelEncoder for each column in turn, so only the mapping of the last
# column survived; none of the others could be inverted or re-applied to
# another frame (e.g. the test set). Keep one fitted encoder per column.
label_encoders = {}
for feature_name in Features:
    label_encoders[feature_name] = LabelEncoder()
    train[feature_name] = label_encoders[feature_name].fit_transform(train[feature_name])

# Kept for backward compatibility: as before, `le` ends up being the encoder
# fitted on the last column listed in Features.
le = label_encoders[Features[-1]]

In [260]:
# Correlation heatmap of the model features, the target and age.
Features_trials = ['gender', 'language','signup_method','affiliate_channel','month_FB','country_destination','age']

# Build the matrix from exactly the columns we want to label. The original
# called train.corr() on the whole frame and then overrode the tick labels
# with Features_trials, whose order does not match the frame's column order,
# so the axes were mislabelled; train.corr() also fails on the string-valued
# target in pandas >= 2.0.
corr_input = train[Features_trials].copy()
for column_name in corr_input.select_dtypes(exclude='number').columns:
    # Nominal columns become integer category codes so they can enter the
    # matrix; correlations with such codes are only a rough indication.
    corr_input[column_name] = corr_input[column_name].astype('category').cat.codes
corr_matrix = corr_input.corr()

plt.figure(figsize=(12, 12))
sns.set(font_scale=1.25)
# Tick labels come from corr_matrix itself, so they always match the data.
sns.heatmap(corr_matrix, linewidths=1.5, annot=True, square=True,
            fmt='.2f', annot_kws={'size': 10})
plt.yticks(rotation=0)
plt.show()

In [261]:
train.head()

In [262]:
# Model inputs (x) and the target column (y), both taken from `train`.
x, y = train[Features], train['country_destination']

**downsampling**

In [263]:
from sklearn.utils import resample

# Separate the two classes being balanced: 'NDF' (treated as majority) and 'US'.
df_majority = train[train.country_destination=='NDF']
df_minority = train[train.country_destination=='US']

# Downsample the majority class to the size of the minority class. The
# original hard-coded n_samples=49, which does not track the actual number of
# 'US' rows despite the "to match minority class" comment. min() guards
# against asking for more rows than exist when sampling without replacement.
n_keep = min(len(df_majority), len(df_minority))
df_majority_downsampled = resample(df_majority,
                                   replace=False,     # sample without replacement
                                   n_samples=n_keep,  # match the minority class size
                                   random_state=123)  # reproducible results

# Combine minority class with downsampled majority class.
# NOTE(review): this keeps only the 'NDF' and 'US' rows, and `x` / `y` were
# taken from `train` in the previous cell, before this reassignment, so the
# split and model below do not see the balanced frame. Confirm whether x / y
# should be rebuilt from this frame.
train = pd.concat([df_majority_downsampled, df_minority])

train.info()

**Splitting the training data into train and test sets**

In [264]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

**Instantiating the random forest**

In [265]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic (bootstrap samples, random feature subsets).
# Fix the seed so the reported score is reproducible across runs; 42 matches
# the seed already used for the train/test split.
rfc = RandomForestClassifier(random_state=42)

**Applying Random Forest and calculating score**

In [266]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split. The original called
# fit_transform on x_test as well, which re-estimated mean and variance from
# the test data, so the two splits were scaled inconsistently and test
# statistics leaked into preprocessing.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [267]:
# Train the forest, score it on the held-out split (mean accuracy), keep the
# held-out predictions in Y_pred, and print the accuracy as a percentage.
score = rfc.fit(x_train, y_train).score(x_test, y_test)
Y_pred = rfc.predict(x_test)
print(score*100)