In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from datetime import date
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV

In [2]:
train = pd.read_csv("dataset/train.csv")
test = pd.read_csv("dataset/test.csv")

In [3]:
train.head(10)

Unnamed: 0,Deal_title,Lead_name,Industry,Deal_value,Weighted_amount,Date_of_creation,Pitch,Contact_no,Lead_revenue,Fund_category,...,Designation,Lead_POC_email,Hiring_candidate_role,Lead_source,Level_of_meeting,Last_lead_update,Internal_POC,Resource,Internal_rating,Success_probability
0,TitleM5DZY,"Davis, Perkins and Bishop Inc",Restaurants,320506$,2067263.7$,2020-03-29,Product_2,607.447.7883,50 - 100 Million,Category 2,...,Executive Vice President,charlenewerner@davis.com,Community pharmacist,Website,Level 3,No track,"Davis,Sharrice A",,3,73.6
1,TitleKIW18,Bender PLC LLC,Construction Services,39488$,240876.8$,2019-07-10,Product_2,892-938-9493,500 Million - 1 Billion,Category 4,...,Chairman/CEO/President,terrylogan@bender.com,Recruitment consultant,Others,Level 1,Did not hear back after Level 1,"Brown,Maxine A",No,5,58.9
2,TitleFXSDN,Carter-Henry and Sons,Hospitals/Clinics,359392$,2407926.4$,2019-07-27,Product_1,538.748.2271,500 Million - 1 Billion,Category 4,...,SVP/General Counsel,arielhamilton@carterhenry.com,Health service manager,Marketing Event,Level 1,?,"Georgakopoulos,Vasilios T",No,4,68.8
3,TitlePSK4Y,Garcia Ltd Ltd,Real Estate,76774$,468321.4$,2021-01-30,Product_2,(692)052-1389x75188,500 Million - 1 Billion,Category 3,...,CEO/Co-Founder/Chairman,erinwilson@garcia.com,"Therapist, speech and language",Contact Email,Level 2,Did not hear back after Level 1,"Brown,Maxine A",We have all the requirements,1,64.5
4,Title904GV,Lee and Sons PLC,Financial Services,483896$,,2019-05-22,Product_2,001-878-814-6134x015,50 - 100 Million,Category 3,...,Executive Vice President,mr.christopher@lee.com,Media planner,Website,Level 2,Up-to-date,"Thomas,Lori E",No,4,62.4
5,Title00VOR,Chavez Ltd Inc,Banks,418674$,2637646.2$,2019-06-30,Product_1,(418)259-9934x952,50 - 100 Million,Category 2,...,CEO/Co-Founder/Chairman,crystalchavez@chavez.com,Microbiologist,Marketing Event,Level 3,2 days back,"Featherstone,Adrian R",Deliverable,3,66.3
6,TitleOZQRY,Williamson LLC and Sons,Banks,384356$,2709709.8$,2019-11-20,Product_2,(881)077-4692,500 Million - 1 Billion,Category 1,...,CEO/Co-Founder/Chairman,saradixon@williamson.com,Cartographer,Contact Email,Level 3,More than 2 weeks,"Booker,David L",,4,73.4
7,TitleV05WV,"Livingston, York and Adams Group",Architecture/Engineering,245205$,1642873.5$,2020-12-22,Product_2,953.762.9149,500 Million - 1 Billion,Category 1,...,Vice President / GM (04-present) : VP Sales an...,christianstanley@livingston.com,"Engineer, maintenance (IT)",Others,Level 2,5 days back,"Cashin,Marc C",No,5,74.1
8,TitleC2RNN,Powers Ltd Inc,Education/Training,343280$,1991024.0$,2019-01-08,Product_1,(566)194-6345,100 - 500 Million,Category 3,...,Chairman/Chief Innovation Officer,josephthompson@powers.com,Product/process development scientist,Contact Email,Level 1,No track,"Van Arter,Derrick",We have all the requirements,2,61.4
9,TitleDZT40,Shea Group PLC,REIT,293611$,1791027.1$,2020-07-04,Product_1,(292)819-5746,500 Million - 1 Billion,Category 3,...,CEO,williamgrimes@shea.com,Engineering geologist,Contact Email,Level 1,more than a month,"Hanyok,John J",Deliverable,4,66.0


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7007 entries, 0 to 7006
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Deal_title             7007 non-null   object 
 1   Lead_name              7007 non-null   object 
 2   Industry               7006 non-null   object 
 3   Deal_value             6956 non-null   object 
 4   Weighted_amount        6482 non-null   object 
 5   Date_of_creation       7007 non-null   object 
 6   Pitch                  7007 non-null   object 
 7   Contact_no             7007 non-null   object 
 8   Lead_revenue           7007 non-null   object 
 9   Fund_category          7007 non-null   object 
 10  Geography              6035 non-null   object 
 11  Location               6996 non-null   object 
 12  POC_name               6999 non-null   object 
 13  Designation            7007 non-null   object 
 14  Lead_POC_email         7007 non-null   object 
 15  Hiri

# Clean train data

In [5]:
train = train.drop(['Deal_title', 'Lead_name', 'Contact_no', 'Lead_POC_email'], axis = 1)

In [6]:
train['Date_of_creation'] = pd.to_datetime(train['Date_of_creation'])
train['days_since_created'] = train['Date_of_creation'].apply(lambda x:(pd.Timestamp('today') - x).days)
train.drop(['Date_of_creation'], inplace=True, axis=1)

In [7]:
train.Deal_value = train.Deal_value.apply(lambda x : str(x)[:-1] if str(x)[-1] == '$' else x)
train.Weighted_amount = train.Weighted_amount.apply(lambda x : str(x)[:-1] if str(x)[-1] == '$' else x)

In [8]:
train['min_revenue'] = train.Lead_revenue.apply(lambda x: x.split('-')[0])
train['max_revenue'] = train.Lead_revenue.apply(lambda x: x.split('-')[1])
train = train.drop(['Lead_revenue'], axis = 1)

In [9]:
train.min_revenue = train.min_revenue.apply(lambda x: '500' if x == '500 Million ' else x)
train.max_revenue = train.max_revenue.apply(lambda x: '100' if x == ' 100 Million' else ('500' if x == ' 500 Million' else '1000'))

In [10]:
train["Deal_value"] = train.Deal_value.astype(float)
train["Weighted_amount"] = train.Weighted_amount.astype(float)
train["min_revenue"] = train.min_revenue.astype(float)
train["max_revenue"] = train.max_revenue.astype(float)

In [11]:
train['Weighted_amount'] = train['Weighted_amount'].fillna((train['Weighted_amount'].mean()))
train['Deal_value'] = train['Deal_value'].fillna((train['Deal_value'].mean()))

In [12]:
train['Geography'] = train['Geography'].fillna(method = 'ffill')
train['Resource'] = train['Resource'].fillna(method = 'bfill')

In [13]:
def fill(train, s):
    n = 7007
    x = 'No'
    for i in range(n):
        if train[s][i] == '?':
            train[s][i] = x
        x = train[s][i]
    return train
train = fill(train, 'Last_lead_update')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train[s][i] = x


In [14]:
train.Success_probability = train.Success_probability.apply(lambda x: -x if x < 0 else x)

In [15]:
train.describe()

Unnamed: 0,Deal_value,Weighted_amount,Internal_rating,Success_probability,days_since_created,min_revenue,max_revenue
count,7007.0,7007.0,7007.0,7007.0,7007.0,7007.0,7007.0
mean,249656.025446,1569884.0,3.009562,64.845034,448.625089,219.430569,539.517625
std,144231.008181,886377.2,1.418666,17.56689,224.690222,202.070367,367.838423
min,1551.0,8708.0,1.0,5.0,60.0,50.0,100.0
25%,123293.5,826428.4,2.0,60.6,255.5,50.0,100.0
50%,248796.0,1569884.0,3.0,65.3,448.0,100.0,500.0
75%,375706.5,2293031.0,4.0,69.6,643.0,500.0,1000.0
max,500000.0,3601416.0,5.0,107.34,836.0,500.0,1000.0


In [16]:
X = train.drop(['Success_probability'], axis=1)
y = train.Success_probability

In [17]:
enc = OneHotEncoder(handle_unknown='ignore')
X = pd.DataFrame(enc.fit_transform(X).toarray())

In [18]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20940,20941,20942,20943,20944,20945,20946,20947,20948,20949
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


# clean test data

In [19]:
Deal_title = test.Deal_title
test = test.drop(['Deal_title', 'Lead_name', 'Contact_no', 'Lead_POC_email'], axis = 1)

In [20]:
test['Date_of_creation'] = pd.to_datetime(test['Date_of_creation'])
test['days_since_created'] = test['Date_of_creation'].apply(lambda x:(pd.Timestamp('today') - x).days)
test.drop(['Date_of_creation'], inplace=True, axis=1)

In [21]:
test.Deal_value = test.Deal_value.apply(lambda x : str(x)[:-1] if str(x)[-1] == '$' else x)
test.Weighted_amount = test.Weighted_amount.apply(lambda x : str(x)[:-1] if str(x)[-1] == '$' else x)

In [22]:
test['min_revenue'] = test.Lead_revenue.apply(lambda x: x.split('-')[0])
test['max_revenue'] = test.Lead_revenue.apply(lambda x: x.split('-')[1])
test = test.drop(['Lead_revenue'], axis = 1)

In [23]:
test.min_revenue = test.min_revenue.apply(lambda x: '500' if x == '500 Million ' else x)
test.max_revenue = test.max_revenue.apply(lambda x: '100' if x == ' 100 Million' else ('500' if x == ' 500 Million' else '1000'))

In [24]:
test["Deal_value"] = test.Deal_value.astype(float)
test["Weighted_amount"] = test.Weighted_amount.astype(float)
test["min_revenue"] = test.min_revenue.astype(float)
test["max_revenue"] = test.max_revenue.astype(float)

In [25]:
test['Weighted_amount'] = test['Weighted_amount'].fillna((test['Weighted_amount'].mean()))
test['Deal_value'] = test['Deal_value'].fillna((test['Deal_value'].mean()))

In [26]:
test['Geography'] = test['Geography'].fillna(method = 'ffill')
test['Resource'] = test['Resource'].fillna(method = 'bfill')

In [27]:
def fill(test, s):
    n = 2093
    x = 'No'
    for i in range(n):
        if test[s][i] == '?':
            test[s][i] = x
        x = test[s][i]
    return test
test = fill(test, 'Last_lead_update')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[s][i] = x


In [28]:
test.Internal_rating = test.Internal_rating.apply(lambda x: -x if x < 0 else x)

In [29]:
test.describe()

Unnamed: 0,Deal_value,Weighted_amount,Internal_rating,days_since_created,min_revenue,max_revenue
count,2093.0,2093.0,2093.0,2093.0,2093.0,2093.0
mean,248052.138889,1556917.0,3.189221,449.614907,213.186813,525.70473
std,141862.423966,887399.4,4.478122,223.069168,200.502528,368.447914
min,2025.0,13162.5,1.0,60.0,50.0,100.0
25%,126390.0,804990.0,2.0,259.0,50.0,100.0
50%,250650.0,1556917.0,3.0,449.0,100.0,500.0
75%,372023.0,2311754.0,4.0,644.0,500.0,1000.0
max,499392.0,3409163.0,82.34,836.0,500.0,1000.0


In [30]:
test = pd.DataFrame(enc.transform(test).toarray())

In [31]:
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20940,20941,20942,20943,20944,20945,20946,20947,20948,20949
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [32]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
model = RandomForestRegressor(criterion="mae", max_features="sqrt", n_jobs=-1, n_estimators=100)
model.fit(X_train, y_train)
pred = model.predict(X_val)
score = max(0, 100-np.sqrt(mean_squared_error(y_val, pred)))
score

82.79187444798094

In [34]:
model1 = RandomForestRegressor(criterion="mse", max_features="sqrt", n_jobs=-1, n_estimators=100)
model1.fit(X_train, y_train)
pred1 = model1.predict(X_val)
score1 = max(0, 100-np.sqrt(mean_squared_error(y_val, pred1)))
score1

82.76838805792767

In [35]:
model2 = RandomForestRegressor(criterion="mae", max_features="log2", n_jobs=-1, n_estimators=100)
model2.fit(X_train, y_train)
pred2 = model2.predict(X_val)
score2 = max(0, 100-np.sqrt(mean_squared_error(y_val, pred2)))
score2

82.70172184023973

In [39]:
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)
pred3 = regressor.predict(X_val)
score3 = max(0, 100-np.sqrt(mean_squared_error(y_val, pred3)))
score3

79.39670048445524

In [36]:
test_pred = model.predict(test)

In [37]:
output = pd.DataFrame({'Deal_title': Deal_title,
                       'Success_probability': test_pred})

In [38]:
output.to_csv('submission.csv', index=False)