In [119]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [120]:
offers_df = pd.read_csv("../Data/Processed/offers_processed.csv")
leads_df = pd.read_csv("../Data/Processed/leads_processed.csv")
merge_df = pd.read_csv("../Data/Processed/merge_processed.csv")

In [121]:
leads_df.dtypes

Id                                 object
Acquisition Campaign_0              int64
Acquisition Campaign_1              int64
Acquisition Campaign_2              int64
Acquisition Campaign_3              int64
Acquisition Campaign_4              int64
Acquisition Campaign_5              int64
Converted                            bool
City_0                              int64
City_1                              int64
City_2                              int64
City_3                              int64
City_4                              int64
Use Case_Concerts and festivals     int64
Use Case_Corporate Events           int64
Use Case_Educational Seminars       int64
Use Case_Not_Specified              int64
Use Case_Sports Events              int64
Use Case_Wedding Planning           int64
Source_Inbound                      int64
Source_Not_Specified                int64
Source_Outbound                     int64
month                               int64
day                               

# Dealing With Leads Dataset

In [122]:
# Creating X and y variables
X = leads_df.drop(["Id","Converted"], axis=1)
y = leads_df["Converted"]

# Creating training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [123]:
# Creating and training a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

feature_importances = clf.feature_importances_

# Predict the response for the test dataset
y_pred = clf.predict(X_test)


# Model Accuracy: how often is the classifier correct?
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8923251847640705


In [124]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

# Apply SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
clf.fit(X_train_res, y_train_res)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       False       0.93      0.85      0.89      7995
        True       0.20      0.37      0.26       800

    accuracy                           0.80      8795
   macro avg       0.56      0.61      0.57      8795
weighted avg       0.86      0.80      0.83      8795



In [125]:
# Creating and training a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

feature_importances = clf.feature_importances_

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Model Accuracy: how often is the classifier correct?
print("Accuracy:", accuracy_score(y_test, y_pred))

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.8908470722001137
              precision    recall  f1-score   support

       False       0.92      0.96      0.94      7995
        True       0.31      0.16      0.21       800

    accuracy                           0.89      8795
   macro avg       0.61      0.56      0.58      8795
weighted avg       0.86      0.89      0.87      8795



In [126]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Create a Logistic Regression model
clf = LogisticRegression()
clf = LogisticRegression(max_iter=1000)

# Perform 5-fold cross validation
scores = cross_val_score(clf, X, y, cv=5)

# Print cross validation scores
print("Cross-validation scores: ", scores)

# Print the average of the cross-validation scores
print("Average cross-validation score: ", scores.mean())


Cross-validation scores:  [0.90244457 0.90938033 0.90959745 0.90084148 0.90902888]
Average cross-validation score:  0.9062585438958264


In [127]:
# Create a Logistic Regression model
clf = RandomForestClassifier(n_estimators=100)

# Perform 5-fold cross validation
scores = cross_val_score(clf, X, y, cv=5)

# Print cross validation scores
print("Cross-validation scores: ", scores)

# Print the average of the cross-validation scores
print("Average cross-validation score: ", scores.mean())

Cross-validation scores:  [0.86378624 0.85332575 0.790539   0.73345463 0.90618604]
Average cross-validation score:  0.8294583326814771


In [128]:
# Create a Logistic Regression model
clf = LogisticRegression()
clf = LogisticRegression(max_iter=1000)

# Apply SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
clf.fit(X_train_res, y_train_res)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.94      0.74      0.83      7995
        True       0.18      0.57      0.27       800

    accuracy                           0.72      8795
   macro avg       0.56      0.65      0.55      8795
weighted avg       0.87      0.72      0.78      8795



# Dealing With Offers Dataset

In [129]:
offers_df.columns

Index(['Id', 'Status', 'Price', 'Days', 'Discount',
       'Use Case_Concerts and festivals', 'Use Case_Corporate Events',
       'Use Case_Educational Seminars', 'Use Case_Not_Specified',
       'Use Case_Sports Events', 'Use Case_Wedding Planning',
       'Pain_Not_Specified', 'Pain_financial control', 'Pain_operations',
       'Pain_quality of delivery', 'Pain_time saving'],
      dtype='object')

In [130]:
# Creating X and y variables
X = offers_df.drop(["Id","Status"], axis=1)
y = offers_df["Status"]

# Creating training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [131]:
# Creating and training a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

feature_importances = clf.feature_importances_

# Predict the response for the test dataset
y_pred = clf.predict(X_test)


# Model Accuracy: how often is the classifier correct?
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.7552870090634441


# Merged

In [132]:
merge_df

Unnamed: 0,Id,Acquisition Campaign_0,Acquisition Campaign_1,Acquisition Campaign_2,Acquisition Campaign_3,Acquisition Campaign_4,Acquisition Campaign_5,Converted,City_0,City_1,...,day,Status,Price,Days,Discount,Pain_Not_Specified,Pain_financial control,Pain_operations,Pain_quality of delivery,Pain_time saving
0,su014jpj,0,0,0,0,0,1,True,0,0,...,17,Closed Lost,240,-255,True,0,0,0,1,0
1,uo3alag3,0,0,0,0,0,1,True,0,0,...,17,Closed Lost,200,0,True,0,1,0,0,0
2,2exqos94,0,0,0,0,0,1,True,0,0,...,1,Checkbox,960,0,False,1,0,0,0,0
3,387283th,0,0,0,0,0,1,True,0,0,...,17,Closed Won,240,0,False,0,0,1,0,0
4,ipkk8eiv,0,0,0,0,0,1,True,0,0,...,1,Closed Lost,360,170,False,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3708,z35lw3kn,0,0,0,1,1,0,True,0,0,...,26,Closed Lost,0,0,True,0,1,0,0,0
3709,b1rznob8,0,1,0,0,0,1,True,0,0,...,26,Closed Won,240,0,False,0,0,1,0,0
3710,3iab5imb,0,1,1,0,1,0,True,0,0,...,27,Negotiation,300,0,False,0,0,1,0,0
3711,vxabmzpv,0,1,0,1,0,0,True,0,0,...,27,Closed Won,300,0,False,1,0,0,0,0


In [140]:
# Creating X and y variables
X = merge_df.drop(["Id","Status"], axis=1)
y = merge_df["Status"]

# Creating training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [141]:
# Create a Logistic Regression model
clf = RandomForestClassifier(n_estimators=100)

# Perform 5-fold cross validation
scores = cross_val_score(clf, X, y, cv=5)

# Print cross validation scores
print("Cross-validation scores: ", scores)

# Print the average of the cross-validation scores
print("Average cross-validation score: ", scores.mean())

Cross-validation scores:  [0.76312248 0.74831763 0.76312248 0.76415094 0.68867925]
Average cross-validation score:  0.7454785545595368
