In [3]:
from path import Path
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [4]:
data = Path('../Resources/Travel.csv')
vacay_df = pd.read_csv(data)
vacay_df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [3]:
# Verify row Count to look for null or NaN data
vacay_df.count()

CustomerID                  4888
ProdTaken                   4888
Age                         4662
TypeofContact               4863
CityTier                    4888
DurationOfPitch             4637
Occupation                  4888
Gender                      4888
NumberOfPersonVisiting      4888
NumberOfFollowups           4843
ProductPitched              4888
PreferredPropertyStar       4862
MaritalStatus               4888
NumberOfTrips               4748
Passport                    4888
PitchSatisfactionScore      4888
OwnCar                      4888
NumberOfChildrenVisiting    4822
Designation                 4888
MonthlyIncome               4655
dtype: int64

In [4]:
#Drop the rows from data frame that are missing values
vacay_df.dropna(inplace = True)

In [5]:
# Verify null/NaN rows were dropped and new values returned to DF
vacay_df.count()

CustomerID                  4128
ProdTaken                   4128
Age                         4128
TypeofContact               4128
CityTier                    4128
DurationOfPitch             4128
Occupation                  4128
Gender                      4128
NumberOfPersonVisiting      4128
NumberOfFollowups           4128
ProductPitched              4128
PreferredPropertyStar       4128
MaritalStatus               4128
NumberOfTrips               4128
Passport                    4128
PitchSatisfactionScore      4128
OwnCar                      4128
NumberOfChildrenVisiting    4128
Designation                 4128
MonthlyIncome               4128
dtype: int64

In [6]:
# Separate the Features (X) from the Target (y)
y = vacay_df["ProdTaken"]
X = vacay_df.drop(columns="ProdTaken")

In [7]:
#Check the balance of the model
y.value_counts("ProdTaken")

0    0.806928
1    0.193072
Name: ProdTaken, dtype: float64

In [8]:
#Check the balance of the model
vacay_df.value_counts("ProdTaken")

ProdTaken
0    3331
1     797
dtype: int64

In [9]:
#verifying the data types
X.dtypes

CustomerID                    int64
Age                         float64
TypeofContact                object
CityTier                      int64
DurationOfPitch             float64
Occupation                   object
Gender                       object
NumberOfPersonVisiting        int64
NumberOfFollowups           float64
ProductPitched               object
PreferredPropertyStar       float64
MaritalStatus                object
NumberOfTrips               float64
Passport                      int64
PitchSatisfactionScore        int64
OwnCar                        int64
NumberOfChildrenVisiting    float64
Designation                  object
MonthlyIncome               float64
dtype: object

In [10]:
# Generate our categorical variable list
vacay_cat = vacay_df.dtypes[vacay_df.dtypes == "object"].index.tolist()
                            
# Check the number of unique values in each column
vacay_df[vacay_cat].nunique()

TypeofContact     2
Occupation        4
Gender            3
ProductPitched    5
MaritalStatus     4
Designation       5
dtype: int64

In [11]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and Transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(vacay_df[vacay_cat]))

#Add teh encded variablee names to the DataFrame
encode_df.columns = enc.get_feature_names(vacay_cat)
encode_df.head()

Unnamed: 0,TypeofContact_Company Invited,TypeofContact_Self Enquiry,Occupation_Free Lancer,Occupation_Large Business,Occupation_Salaried,Occupation_Small Business,Gender_Fe Male,Gender_Female,Gender_Male,ProductPitched_Basic,...,ProductPitched_Super Deluxe,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Unmarried,Designation_AVP,Designation_Executive,Designation_Manager,Designation_Senior Manager,Designation_VP
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [12]:
# Merge one -hot encoded features and drop the originals
vcay_df = vacay_df.merge(encode_df, left_index=True, right_index=True)
vacay_df= vacay_df.drop(vacay_cat,1)
vacay_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,CustomerID,ProdTaken,Age,CityTier,DurationOfPitch,NumberOfPersonVisiting,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,MonthlyIncome
0,200000,1,41.0,3,6.0,3,3.0,3.0,1.0,1,2,1,0.0,20993.0
1,200001,0,49.0,1,14.0,3,4.0,4.0,2.0,0,3,1,2.0,20130.0
2,200002,1,37.0,1,8.0,3,4.0,3.0,7.0,1,3,0,0.0,17090.0
3,200003,0,33.0,1,9.0,2,3.0,3.0,2.0,1,5,1,1.0,17909.0
5,200005,0,32.0,1,8.0,3,3.0,3.0,1.0,0,5,1,1.0,18068.0


In [13]:
# Remove ProdTaken target from features data
y = vacay_df.ProdTaken
X = vacay_df.drop(columns=["ProdTaken"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_train.shape

(3096, 13)

In [14]:
# Create Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [15]:
#Fit (train) or model using the training data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [1]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}.reset_index(drop=True))
results.head(20)

SyntaxError: invalid syntax (Temp/ipykernel_15032/2090727028.py, line 3)

In [17]:

print(accuracy_score(y_test, y_pred))

0.8071705426356589


In [18]:
#Confustion Matrixx
lr_matrix = confusion_matrix(y_test, y_pred)
display(lr_matrix)
#print(lr_matrix)

array([[833,   0],
       [199,   0]], dtype=int64)

In [19]:
lr_report = classification_report(y_test, y_pred)
print("Logistic Classification Report")
print(lr_report)

Logistic Classification Report
              precision    recall  f1-score   support

           0       0.81      1.00      0.89       833
           1       0.00      0.00      0.00       199

    accuracy                           0.81      1032
   macro avg       0.40      0.50      0.45      1032
weighted avg       0.65      0.81      0.72      1032



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
from collections import Counter
Counter(y_train)
Counter(y_test)

Counter({0: 833, 1: 199})

In [21]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

#Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

#Evaluate the model
y_pred=rf_model.predict(X_test_scaled)
print(f"Random Forest Predictive Accuracy: {accuracy_score(y_test,y_pred):.3f}")

Random Forest Predictive Accuracy: 0.886


In [22]:
#Evaluate the model
print("Random Forest Classification Report")
rf_report = classification_report(y_test, y_pred)
print(rf_report)
y_pred=rf_model.predict(X_test_scaled)
print(f"Random Forest Predictive Accuracy: {accuracy_score(y_test,y_pred):.3f}")

# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)
predictions

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
cm_df


Random Forest Classification Report
              precision    recall  f1-score   support

           0       0.89      0.98      0.93       833
           1       0.88      0.47      0.61       199

    accuracy                           0.89      1032
   macro avg       0.88      0.73      0.77      1032
weighted avg       0.88      0.89      0.87      1032

Random Forest Predictive Accuracy: 0.886


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,820,13
Actual 1,105,94
