In this notebook, a Random Forest classifier is used. A baseline model is trained and scored on the validation data, then hyperparameter tuning is carried out with RandomizedSearchCV to find the optimal parameters, followed by feature selection with the aid of Boruta. Data preprocessing is carried out on both the train and test data. These steps are shown below.

In [None]:
# importing libraries and train data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
# Fetch the labelled training set from the dphi-official Datasets repository
# and print its schema (column names, non-null counts, dtypes).
train_csv_url = "https://raw.githubusercontent.com/dphi-official/Datasets/master/Tinder_Millennial_Match/train_set_label.csv"
train_data = pd.read_csv(train_csv_url)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1896 entries, 0 to 1895
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        1896 non-null   float64
 1   Segment type              1896 non-null   object 
 2   Segment Description       1896 non-null   object 
 3   Answer                    1896 non-null   object 
 4   Count                     1896 non-null   float64
 5   Percentage                1896 non-null   float64
 6   It became a relationship  1896 non-null   int64  
dtypes: float64(3), int64(1), object(3)
memory usage: 103.8+ KB


In [None]:
# Preview the raw training frame.
train_data

Unnamed: 0,ID,Segment type,Segment Description,Answer,Count,Percentage,It became a relationship
0,292890.8970,web,"Meridian, Idaho",No,0.000000,0.000000,0
1,292887.9870,web,"Meridian, Idaho",No,0.000000,0.000000,0
2,292894.0656,gender,"Meridian, Idaho",No,499.173606,0.225255,0
3,292887.1180,web,"Meridian, Idaho",No,0.000000,0.000000,0
4,292893.6561,gender,"Meridian, Idaho",No,455.925963,0.211360,0
...,...,...,...,...,...,...,...
1891,292887.5496,web,"Meridian, Idaho",No,0.000000,0.000000,0
1892,292881.6932,mobile,"Meridian, Idaho",No,1203.190399,0.312360,0
1893,292900.8499,gender,"Meridian, Idaho",No,806.378820,0.488025,0
1894,292893.8600,gender,"Meridian, Idaho",No,1149.529381,0.488984,0


In [None]:
# Inspect one of the categorical columns: number of rows per distinct
# 'Segment Description' value, most frequent first.
train_data['Segment Description'].value_counts()

Meridian, Idaho               1421
Westport, CT                    14
University of Pennsylvania      12
University of Mississippi       10
University of Washington        10
                              ... 
University of Iowa               1
Randolph-Macon College           1
Texas Tech University            1
Tulane University                1
University of Alabama            1
Name: Segment Description, Length: 148, dtype: int64

In [None]:
# Mean-impute missing values in every numeric column of the training frame.
# NOTE(review): the numeric selection also picks up 'ID' and the target
# 'It became a relationship' -- confirm that imputing those is intended.
num_cols = train_data.select_dtypes(include=np.number).columns
numeric_means = train_data[num_cols].mean()
train_data[num_cols] = train_data[num_cols].fillna(numeric_means)

In [None]:
# Fill any missing values in the categorical (object-dtype) columns with each
# column's most frequent value.
cat_cols = train_data.select_dtypes(include='object').columns
most_frequent = train_data[cat_cols].mode().iloc[0]   # first mode() row = top value per column
train_data[cat_cols] = train_data[cat_cols].fillna(most_frequent)

In [None]:
# Work on a copy so the dummy encoding applied to `a` leaves `train_data` untouched.
a=train_data.copy()

In [None]:
# One-hot encode the categorical columns: each is replaced by a set of
# '<column>_<value>' indicator columns, as the printed column index shows.
a = pd.get_dummies(a, columns = cat_cols) 
print(a.columns)


Index(['ID', 'Count', 'Percentage', 'It became a relationship',
       'Segment type_gender', 'Segment type_mobile', 'Segment type_university',
       'Segment type_web', 'Segment Description_Appalachian State University',
       'Segment Description_Arizona State University',
       ...
       'Segment Description_Washington University in St. Louis',
       'Segment Description_Web-based respondents',
       'Segment Description_Wesleyan University',
       'Segment Description_Westport, CT',
       'Segment Description_Whatsgoodly University',
       'Segment Description_Williams College',
       'Segment Description_Yale University', 'Answer_I don't use Tinder',
       'Answer_No', 'Answer_Yes'],
      dtype='object', length=159)


In [None]:
# Split the encoded frame into the label series and the feature matrix.
# NOTE(review): only the target is removed, so 'ID' remains in X as a
# predictor -- confirm that is intended.
target_col = 'It became a relationship'
y = a[target_col]
X = a.drop(columns=[target_col])

In [None]:
# Standardise every feature column and keep the original column labels.
# NOTE(review): `scaled_features` is not consumed afterwards -- the
# train/validation split is given the unscaled `X`.
from sklearn.preprocessing import StandardScaler

scaled_features = pd.DataFrame(
    data=StandardScaler().fit_transform(X),
    columns=X.columns,
)

In [None]:
# Hold out 20% of the rows for validation; the seed fixes which rows those are.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)
X_train, X_valid, y_train, y_valid = split_parts

In [None]:
# Fit the baseline model: a RandomForestClassifier where only the seed is set
# explicitly, trained on the training split.
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(random_state=0)
model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [None]:
# Score the baseline model on the held-out validation split.
from sklearn.metrics import accuracy_score

pred = model.predict(X_valid)
accuracy_score(y_valid, pred)

0.9

In [None]:
#Hyperparameter tuning with RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
def print_results(results):
    """Print a summary of a fitted hyperparameter search.

    The first line shows ``results.best_params_`` followed by a blank line.
    After that, one line is printed per candidate in ``results.cv_results_``:
    the mean test score rounded to 3 decimals, twice the standard deviation
    of the test score rounded to 3 decimals, and the candidate's parameters.

    Parameters
    ----------
    results : object
        Anything exposing ``best_params_`` and a ``cv_results_`` mapping with
        the keys 'mean_test_score', 'std_test_score' and 'params'.

    Returns
    -------
    None
    """
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    cv = results.cv_results_
    rows = zip(cv['mean_test_score'], cv['std_test_score'], cv['params'])
    for avg_score, score_std, candidate in rows:
        spread = round(score_std * 2, 3)   # reported as +/- two standard deviations
        print('{} (+/-{}) for {}'.format(round(avg_score, 3), spread, candidate))

In [None]:
# Detecting the best parameters
#
# The split/leaf limits are searched as absolute sample counts. Searching them
# as fractions of the training set (0.1-1.0 for splits, 0.1-0.5 for leaves)
# constrained the trees so heavily that, in the recorded run, every listed
# candidate got the same cross-validated score, so the search could not rank
# them.
model = RandomForestClassifier(random_state=0)
parameters = {
    # 'auto' is left out: for classifiers it is an alias of 'sqrt'.
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5, 10, 20, 50],
    # None (unlimited depth) keeps the baseline configuration inside the space.
    'max_depth': list(range(1, 20)) + [None],
    'min_samples_leaf': [1, 2, 4, 8, 16],
}

# random_state makes the sampled candidates, and therefore the reported best
# parameters, reproducible across runs.
rs = RandomizedSearchCV(model, parameters, cv=5, n_iter=20, random_state=0)
rs.fit(X_train, y_train)

print_results(rs)

BEST PARAMS: {'min_samples_split': 0.4, 'min_samples_leaf': 0.1, 'max_features': 'sqrt', 'max_depth': 13}

0.65 (+/-0.001) for {'min_samples_split': 0.4, 'min_samples_leaf': 0.1, 'max_features': 'sqrt', 'max_depth': 13}
0.65 (+/-0.001) for {'min_samples_split': 1.0, 'min_samples_leaf': 0.18888888888888888, 'max_features': 'auto', 'max_depth': 4}
0.65 (+/-0.001) for {'min_samples_split': 0.4, 'min_samples_leaf': 0.2777777777777778, 'max_features': 'log2', 'max_depth': 9}
0.65 (+/-0.001) for {'min_samples_split': 0.9, 'min_samples_leaf': 0.4111111111111111, 'max_features': 'log2', 'max_depth': 15}
0.65 (+/-0.001) for {'min_samples_split': 0.4, 'min_samples_leaf': 0.5, 'max_features': 'sqrt', 'max_depth': 7}
0.65 (+/-0.001) for {'min_samples_split': 1.0, 'min_samples_leaf': 0.23333333333333334, 'max_features': 'sqrt', 'max_depth': 13}
0.65 (+/-0.001) for {'min_samples_split': 0.30000000000000004, 'min_samples_leaf': 0.32222222222222224, 'max_features': 'sqrt', 'max_depth': 10}
0.65 (+/-0.

In [None]:
!pip install Boruta



Collecting Boruta
[?25l  Downloading https://files.pythonhosted.org/packages/b2/11/583f4eac99d802c79af9217e1eff56027742a69e6c866b295cce6a5a8fc2/Boruta-0.3-py3-none-any.whl (56kB)
[K     |█████▉                          | 10kB 16.8MB/s eta 0:00:01[K     |███████████▋                    | 20kB 22.2MB/s eta 0:00:01[K     |█████████████████▍              | 30kB 26.5MB/s eta 0:00:01[K     |███████████████████████▏        | 40kB 30.1MB/s eta 0:00:01[K     |█████████████████████████████   | 51kB 4.4MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.2MB/s 
Installing collected packages: Boruta
Successfully installed Boruta-0.3


In [None]:
# Feature selection with Boruta, wrapped around the random-forest `model`.
from sklearn.feature_selection import SelectFromModel
from boruta import BorutaPy

# verbose=2 prints the Confirmed / Tentative / Rejected counts per iteration.
boruta_selector = BorutaPy(
    model,
    n_estimators='auto',
    verbose=2,
    random_state=1,
)

# BorutaPy is fitted on plain arrays rather than on the pandas objects.
train_matrix = np.array(X_train)
train_labels = np.array(y_train)
boruta_selector.fit(train_matrix, train_labels)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	158
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	158
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	158
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	158
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	158
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	158
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	158
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	5
Rejected: 	153
Iteration: 	9 / 100
Confirmed: 	4
Tentative: 	1
Rejected: 	153
Iteration: 	10 / 100
Confirmed: 	4
Tentative: 	1
Rejected: 	153
Iteration: 	11 / 100
Confirmed: 	4
Tentative: 	1
Rejected: 	153
Iteration: 	12 / 100
Confirmed: 	4
Tentative: 	1
Rejected: 	153
Iteration: 	13 / 100
Confirmed: 	4
Tentative: 	1
Rejected: 	153
Iteration: 	14 / 100
Confirmed: 	4
Tentative: 	1
Rejected: 	153
Iteration: 	15 / 100
Confirmed: 	4
Tentative: 	1
Rejected: 	153
Iteration: 	16 / 100
Confirmed: 	4
Tentative: 	1


BorutaPy(alpha=0.05,
         estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                          class_weight=None, criterion='gini',
                                          max_depth=None, max_features='auto',
                                          max_leaf_nodes=None, max_samples=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          min_weight_fraction_leaf=0.0,
                                          n_estimators=31, n_jobs=None,
                                          oob_score=False,
                                          random_state=RandomState(MT19937) at 0x7F1F90469048,
                                          verbose=0, warm_start=False),
         max_iter=100, n_estimators=

In [None]:
# Keep only the columns Boruta accepted, then refit a forest on that reduced
# matrix using the hyperparameters selected by the randomized search `rs`.
# Reading them from rs.best_params_ (instead of retyping them from the printed
# output) keeps this cell in step with whatever the search actually found.
X_important_train = boruta_selector.transform(np.array(X_train))
model1 = RandomForestClassifier(random_state=0, **rs.best_params_)
model1.fit(X_important_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=13, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.1, min_samples_split=0.4,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [None]:
# Validation accuracy of the refit model, using the same Boruta-selected columns.
from sklearn.metrics import accuracy_score

X_important_valid = boruta_selector.transform(np.array(X_valid))
pred = model1.predict(X_important_valid)
accuracy_score(y_valid, pred)

0.7605263157894737

In [None]:
# Fetch the test set from the same dphi-official Datasets repository.
test_csv_url = 'https://raw.githubusercontent.com/dphi-official/Datasets/master/Tinder_Millennial_Match/test_set_label.csv'
test_data = pd.read_csv(test_csv_url)

In [None]:
# Mean-impute missing values in the numeric columns of the test frame.
# NOTE(review): the means are taken from the test data itself and `num_cols`
# is rebound to the test frame's numeric columns.
num_cols = test_data.select_dtypes(include=np.number).columns
test_numeric_means = test_data[num_cols].mean()
test_data[num_cols] = test_data[num_cols].fillna(test_numeric_means)

In [None]:
# Fill missing values in the test frame's categorical (object-dtype) columns
# with each column's most frequent value; `cat_cols` is rebound here.
cat_cols = test_data.select_dtypes(include='object').columns
test_most_frequent = test_data[cat_cols].mode().iloc[0]
test_data[cat_cols] = test_data[cat_cols].fillna(test_most_frequent)

In [None]:
# One-hot encode the categorical columns of a copy of the test frame,
# then display the encoded result.
c = pd.get_dummies(test_data.copy(), columns=cat_cols)
c

Unnamed: 0,ID,Count,Percentage,Segment type_gender,Segment type_mobile,Segment type_university,Segment type_web,Segment Description_Appalachian State University,Segment Description_Arkansas State University,Segment Description_Auburn University,Segment Description_Baylor University,Segment Description_Boston College,Segment Description_Brigham Young University,Segment Description_Bucknell University,Segment Description_Carnegie Mellon University,Segment Description_Case Western Reserve University,Segment Description_Castleton State College,Segment Description_Centre College,Segment Description_Clemson University,Segment Description_College of William and Mary,Segment Description_Columbia University,Segment Description_Duke University,Segment Description_Elon University,Segment Description_Emory University,Segment Description_Florida State University,Segment Description_Georgetown University,Segment Description_Gonzaga University,Segment Description_Harvard University,Segment Description_Illinois State,Segment Description_Indiana University,Segment Description_Iowa State University,Segment Description_James Madison University,Segment Description_Lehigh University,Segment Description_Louisiana State University,Segment Description_Loyola University Maryland,Segment Description_Marquette University,"Segment Description_Meridian, Idaho",Segment Description_Miami University (Ohio),Segment Description_Michigan State University,Segment Description_Michigan Technological University,...,Segment Description_University of Colorado Boulder,Segment Description_University of Dayton,Segment Description_University of Florida,Segment Description_University of Georgia,Segment Description_University of Illinois,Segment Description_University of Iowa,Segment Description_University of Kansas,Segment Description_University of Kentucky,Segment Description_University of Miami,Segment Description_University of Michigan,Segment Description_University of Mississippi,Segment 
Description_University of Missouri,Segment Description_University of North Carolina,Segment Description_University of Notre Dame,Segment Description_University of Oklahoma,Segment Description_University of Oregon,Segment Description_University of Pennsylvania,Segment Description_University of Pittsburgh,Segment Description_University of Rochester,Segment Description_University of Southern California,Segment Description_University of St Andrews,Segment Description_University of Tampa,Segment Description_University of Texas,Segment Description_University of Virginia,Segment Description_University of Washington,"Segment Description_University of Wisconsin, Eau Claire",Segment Description_University of Wisconsin-La Crosse,Segment Description_University of Wisconsin-Madison,Segment Description_University of Wisconsin-Whitewater,Segment Description_Vanderbilt University,Segment Description_Vassar College,Segment Description_Washington State University,Segment Description_Washington University in St. Louis,Segment Description_Wesleyan University,"Segment Description_Westport, CT",Segment Description_Whatsgoodly University,Segment Description_Yale University,Answer_I don't use Tinder,Answer_No,Answer_Yes
0,292974.2311,0.230742,0.115371,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,292889.7996,0.000000,0.000000,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,293724.0350,0.000000,0.000000,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,292884.0724,1497.699982,0.388752,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,292882.0964,1439.464988,0.373646,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
627,293085.4757,1.000000,0.743904,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
628,292890.5771,0.000000,0.000000,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
629,292891.5922,1104.827490,0.411368,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
630,292889.9699,0.000000,0.000000,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [None]:
# Creating dummy variables separately for the train and test data leaves the two frames with different columns. The cell below resolves that.

In [None]:
# Give the encoded test frame exactly the columns (and column order) of the
# encoded training frame `a`: columns present in `c` keep their values, columns
# missing from `c` are filled with `default`, and columns that exist only in
# `c` are dropped. One reindex call replaces inserting the columns one at a
# time into an empty DataFrame.
default = 0

actual_x_input_for_prediction = c.reindex(columns=a.columns, fill_value=default)

In [None]:
# Now the test data is put through the same remaining steps that the train data went through.

In [None]:
# Remove the target column so only the feature columns remain.
actual_x_input_for_prediction = actual_x_input_for_prediction.drop(
    columns=['It became a relationship']
)

In [None]:
# Standardise the aligned test features and keep their column labels.
# NOTE(review): a new scaler is fitted on the test frame here, and
# `scaled_features` is not what gets passed to boruta_selector.transform --
# that call receives the unscaled frame.
from sklearn.preprocessing import StandardScaler

scaled_features = pd.DataFrame(
    data=StandardScaler().fit_transform(actual_x_input_for_prediction),
    columns=actual_x_input_for_prediction.columns,
)

In [None]:
# Reduce the aligned test features to the columns kept by the fitted Boruta selector.
actual_x_input_for_prediction_important = boruta_selector.transform(np.array(actual_x_input_for_prediction))

In [None]:
# Predict labels for the test rows with the model refit on the selected features.
target=model1.predict(actual_x_input_for_prediction_important)

In [None]:
# `target` holds the model's final predictions for the unseen test data;
# write them out as a single-column CSV without the index.
res = pd.DataFrame(target, columns=["prediction"])
res.to_csv("submissiondatasprint-24.csv", index=False)

In [None]:
# Read the submission file back in and display it to check what was written.
u=pd.read_csv("submissiondatasprint-24.csv")
u

Unnamed: 0,prediction
0,0
1,0
2,0
3,1
4,1
...,...
627,0
628,0
629,0
630,0
