In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
offers_df = pd.read_csv("../Data/Processed/offers_processed.csv")
leads_df = pd.read_csv("../Data/Processed/leads_processed.csv")
merge_df = pd.read_csv("../Data/Processed/merge_processed.csv")

In [3]:
leads_df.dtypes

Id                                 object
Acquisition Campaign_0              int64
Acquisition Campaign_1              int64
Acquisition Campaign_2              int64
Acquisition Campaign_3              int64
Acquisition Campaign_4              int64
Acquisition Campaign_5              int64
Converted                            bool
City_0                              int64
City_1                              int64
City_2                              int64
City_3                              int64
City_4                              int64
Use Case_Concerts and festivals      bool
Use Case_Corporate Events            bool
Use Case_Educational Seminars        bool
Use Case_Not_Specified               bool
Use Case_Sports Events               bool
Use Case_Wedding Planning            bool
Source_Inbound                       bool
Source_Not_Specified                 bool
Source_Outbound                      bool
month                               int64
day                               

# Dealing With Leads Dataset

In [4]:
# Creating X and y variables
X = leads_df.drop(["Id","Converted"], axis=1)
y = leads_df["Converted"]

# Creating training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Creating and training a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

feature_importances = clf.feature_importances_

# Predict the response for the test dataset
y_pred = clf.predict(X_test)


# Model Accuracy: how often is the classifier correct?
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8932347924957362


In [6]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

# Apply SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
clf.fit(X_train_res, y_train_res)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       False       0.93      0.85      0.89      7995
        True       0.20      0.38      0.26       800

    accuracy                           0.80      8795
   macro avg       0.57      0.61      0.57      8795
weighted avg       0.87      0.80      0.83      8795



In [7]:
# Creating and training a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

feature_importances = clf.feature_importances_

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Model Accuracy: how often is the classifier correct?
print("Accuracy:", accuracy_score(y_test, y_pred))

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.8917566799317794
              precision    recall  f1-score   support

       False       0.92      0.96      0.94      7995
        True       0.31      0.16      0.21       800

    accuracy                           0.89      8795
   macro avg       0.62      0.56      0.58      8795
weighted avg       0.86      0.89      0.88      8795



In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Create a Logistic Regression model
clf = LogisticRegression()
clf = LogisticRegression(max_iter=1000)

# Perform 5-fold cross validation
scores = cross_val_score(clf, X, y, cv=5)

# Print cross validation scores
print("Cross-validation scores: ", scores)

# Print the average of the cross-validation scores
print("Average cross-validation score: ", scores.mean())


Cross-validation scores:  [0.90255827 0.90938033 0.90948374 0.90118262 0.90902888]
Average cross-validation score:  0.9063267696474533


In [9]:
# Create a Logistic Regression model
clf = RandomForestClassifier(n_estimators=100)

# Perform 5-fold cross validation
scores = cross_val_score(clf, X, y, cv=5)

# Print cross validation scores
print("Cross-validation scores: ", scores)

# Print the average of the cross-validation scores
print("Average cross-validation score: ", scores.mean())

Cross-validation scores:  [0.86185333 0.85491757 0.78792358 0.73925404 0.90641346]
Average cross-validation score:  0.8300723954766305


In [10]:
# Create a Logistic Regression model
clf = LogisticRegression()
clf = LogisticRegression(max_iter=1000)

# Apply SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
clf.fit(X_train_res, y_train_res)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.94      0.74      0.83      7995
        True       0.18      0.57      0.27       800

    accuracy                           0.72      8795
   macro avg       0.56      0.65      0.55      8795
weighted avg       0.87      0.72      0.78      8795



# Dealing With Offers Dataset

In [11]:
offers_df.columns

Index(['Id', 'Status', 'Price', 'Days', 'Discount',
       'Use Case_Concerts and festivals', 'Use Case_Corporate Events',
       'Use Case_Educational Seminars', 'Use Case_Not_Specified',
       'Use Case_Sports Events', 'Use Case_Wedding Planning',
       'Pain_Not_Specified', 'Pain_financial control', 'Pain_operations',
       'Pain_quality of delivery', 'Pain_time saving'],
      dtype='object')

In [12]:
# Creating X and y variables
X = offers_df.drop(["Id","Status"], axis=1)
y = offers_df["Status"]

# Creating training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Creating and training a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

feature_importances = clf.feature_importances_

# Predict the response for the test dataset
y_pred = clf.predict(X_test)


# Model Accuracy: how often is the classifier correct?
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.7472306143001007


# Merged

In [22]:
merge_df = merge_df[merge_df['Status'].isin(['Closed Lost', 'Closed Won'])]
merge_df.to_csv("../Data/Processed/merge_processed.csv")

In [20]:
# Creating X and y variables
X = merge_df.drop(["Id","Status"], axis=1)
y = merge_df["Status"]

# Creating training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Create a Logistic Regression model
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy*100:.2f}%')

# Perform 5-fold cross validation
scores = cross_val_score(model, X, y, cv=5)

# Print cross validation scores
print("Cross-validation scores: ", scores)

# Print the average of the cross-validation scores
print("Average cross-validation score: ", scores.mean())

Accuracy: 82.94%
Cross-validation scores:  [0.83676471 0.82058824 0.82794118 0.81001473 0.7820324 ]
Average cross-validation score:  0.8154682491553322


In [18]:
# Create a Logistic Regression model
clf = LogisticRegression()
clf = LogisticRegression(max_iter=1000)

# Perform 5-fold cross validation
scores = cross_val_score(clf, X, y, cv=5)

# Print cross validation scores
print("Cross-validation scores: ", scores)

# Print the average of the cross-validation scores
print("Average cross-validation score: ", scores.mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validation scores:  [0.65006729 0.56393001 0.65006729 0.5916442  0.60242588]
Average cross-validation score:  0.6116269367647005


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
