In [2]:
# Import libraries and dependencies
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from pathlib import Path
%matplotlib inline

In [3]:
# Importing ignore warnings
 
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Read the training CSV into a DataFrame and preview its first rows
data = Path('../Resources/train_data_feb19.csv')
df = pd.read_csv(filepath_or_buffer=data)
df.head(n=5)

Unnamed: 0,Trip_date,Occupancy_Status,Occupancy_Cat
0,1-Feb-19,Many Seats Available,1
1,1-Feb-19,Many Seats Available,1
2,1-Feb-19,Many Seats Available,1
3,1-Feb-19,Many Seats Available,1
4,1-Feb-19,Many Seats Available,1


In [5]:
# Build the modelling frame without mutating `df`.
#
# Trip_date is dropped first, so a missing date on its own never removes a
# row. Any row that is still missing a value is then discarded.
#
# Why not fillna(""): writing empty strings into every column in place turns a
# numeric column that contains NaN into mixed str/number objects (which the
# classifier cannot fit), turns a missing label into a spurious "" class, and
# silently changes the frame that an earlier cell already displayed.
feb_df = df.drop(columns=["Trip_date"]).dropna()
feb_df.tail()

Unnamed: 0,Occupancy_Status,Occupancy_Cat
329799,Many Seats Available,1
329800,Many Seats Available,1
329801,Many Seats Available,1
329802,Many Seats Available,1
329803,Many Seats Available,1


In [6]:
# Separate the feature matrix from the target labels.
# NOTE(review): after removing Occupancy_Status the only column left in X is
# Occupancy_Cat, which looks like a numeric encoding of the target itself --
# if so the model is trained on its own label (leakage). Confirm before
# trusting any score computed from this split.
X = feb_df.drop("Occupancy_Status", axis="columns")
y = feb_df.loc[:, "Occupancy_Status"]

In [7]:
# Partition X and y into train/test subsets. No test_size is passed, so the
# library default applies; stratify=y keeps the class mix of y in both parts
# and random_state=1 makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)
X_train.shape

(247353, 1)

In [8]:
# Instantiate the logistic-regression estimator (fitting happens in the next cell)
classifier = LogisticRegression(random_state=1, solver="lbfgs")
classifier

LogisticRegression(random_state=1)

In [9]:
# Fit the estimator on the training partition; the fitted estimator is displayed
classifier.fit(X=X_train, y=y_train)

LogisticRegression(random_state=1)

In [10]:
# Report the classifier's accuracy on the training and the held-out partitions
print(
    f"Training Data Score: {classifier.score(X_train, y_train)}",
    f"Testing Data Score: {classifier.score(X_test, y_test)}",
    sep="\n",
)

Training Data Score: 0.9872934631882371
Testing Data Score: 0.9872894203830154


In [11]:
# Predict the held-out rows and place each prediction beside its true label
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
# y_test still carries its pre-split row labels; replace them with 0..n-1
results = results.reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,Many Seats Available,Many Seats Available
1,Many Seats Available,Many Seats Available
2,Few Seats Available,Few Seats Available
3,Many Seats Available,Many Seats Available
4,Many Seats Available,Many Seats Available


In [12]:
# Confusion matrix for the held-out partition
# (scikit-learn convention: rows are actual classes, columns are predicted classes)
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[ 3494,     0,     0],
       [    0, 77909,     0],
       [ 1048,     0,     0]], dtype=int64)

In [13]:
 # Create a classification report
# Per-class precision, recall and F1 for the held-out partition
report = classification_report(y_test, predictions)
print(report)

                      precision    recall  f1-score   support

 Few Seats Available       0.77      1.00      0.87      3494
Many Seats Available       1.00      1.00      1.00     77909
  Standing Room Only       0.00      0.00      0.00      1048

            accuracy                           0.99     82451
           macro avg       0.59      0.67      0.62     82451
        weighted avg       0.98      0.99      0.98     82451

