 # Logistic Regression

 Logistic Regression is a statistical method for predicting binary outcomes from data.

 Examples of binary outcomes are "yes" vs "no", or "Above 25%ile" vs "Below 25%ile".

 These categories translate to the probability of an observation being a 0 or a 1.

##### 

 We can build a logistic regression model by applying an activation function (the sigmoid) as the final step of our linear model.

 This converts the linear regression output to a probability.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Read the CSV and Perform Basic Data Cleaning

In [2]:
#setting 

columns = [
            'CD','total_rats', 'tons_of_refuge','tons_of_MGP', 'tons_of_paper', 'tons_res_organics', 'tons_sch_organics', 'tons_leaves_organics',
             'tons_xmastrees', 'pct_LU1', 'pct_LU2', 'pct_LU3', 'pct_LU4', 'pct_LU5', 'pct_LU6', 'pct_LU7',
             'pct_LU8', 'pct_LU9', 'pct_LU10', 'pct_LU11', 'percentile'
]

target = ["percentile"]

In [3]:
# Load the data
# "mock-data" file generated from xls spreadsheets of actual data to be used after SQL db developement

df = pd.read_csv('rats_Garb_LU.csv')
df.head()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
df.tail(20)

# Fill the null rows with zeros assuming particularly 
#garbage collection of organics etc is not performed and is included in refuge
df.fillna(value=0, axis=1)
df.tail(20)
#df.count()

Unnamed: 0,CD,total_rats,tons_of_refuge,tons_of_MGP,tons_of_paper,tons_res_organics,tons_sch_organics,tons_leaves_organics,tons_xmastrees,pct_LU1,pct_LU2,pct_LU3,pct_LU4,pct_LU5,pct_LU6,pct_LU7,pct_LU8,pct_LU9,pct_LU10,pct_LU11
39,10 MANHATTAN,6831,1131518.8,54826.4,53265.6,102.9,1005.6,54.9,501.3,4.1,19.8,22.3,23.2,4.9,0.8,0.9,14.3,6.1,0.6,3.0
40,11 MANHATTAN,3286,1006505.2,44949.0,47872.4,3.8,778.3,10.8,360.9,0.3,4.4,17.5,7.1,1.9,0.4,10.0,9.5,43.1,1.5,4.3
41,12 MANHATTAN,4920,2139581.2,147001.7,112987.8,824.9,2409.6,0.0,833.4,0.4,8.4,10.6,9.8,2.5,0.4,7.7,8.0,43.7,1.8,6.9
42,01 QUEENS,3032,1744235.6,163543.1,218689.2,0.0,0.0,1454.6,894.7,16.4,17.5,7.4,6.0,6.8,8.7,20.1,4.0,7.4,2.3,3.4
43,02 QUEENS,2348,1060287.7,98901.4,134628.3,2327.0,1677.4,720.2,596.6,11.4,8.4,4.5,4.1,6.1,23.6,12.8,3.9,15.4,5.8,4.0
44,03 QUEENS,1752,1808392.6,133626.9,138515.3,0.0,0.0,1258.6,618.5,42.6,18.3,9.3,5.2,9.8,0.5,1.1,6.5,3.9,1.9,0.9
45,04 QUEENS,1641,1590480.8,111604.2,125139.7,0.0,0.0,668.7,393.4,27.3,28.3,11.0,8.1,6.6,1.9,5.3,6.8,2.1,1.3,1.3
46,05 QUEENS,4083,1857490.3,198092.1,261565.0,10608.9,4934.7,2867.6,1319.9,31.0,9.8,0.5,3.0,3.0,9.1,6.8,2.4,31.9,1.3,1.1
47,06 QUEENS,970,1094103.8,98775.5,168834.0,0.0,243.5,3569.3,491.0,41.8,4.4,16.3,5.9,5.9,0.1,6.0,5.5,12.1,0.6,1.2
48,07 QUEENS,1752,2461125.3,203281.2,328594.4,6053.2,0.0,8761.8,1569.2,33.4,6.1,4.2,2.6,4.1,4.0,3.0,5.4,10.7,1.8,24.7


In [4]:
#getting stats for rats

data = df.total_rats

# removing null values to avoid errors 
data.dropna(inplace = True) 
  
# percentile list
perc =[.25, .40, .60, .75]
  
# list of dtypes to include
include =['object', 'float', 'int']
  
# calling describe method
desc = data.describe(percentiles = perc, include = include)
  
# display
desc

count      59.000000
mean     2897.949153
std      1775.622945
min       522.000000
25%      1572.000000
40%      2160.400000
50%      2517.000000
60%      2970.400000
75%      3625.500000
max      9437.000000
Name: total_rats, dtype: float64

In [5]:
df['percentile'] = np.where(df['total_rats']< 1572 , 'Below' , 'Above' )

df.head()
df.dtypes

CD                       object
total_rats                int64
tons_of_refuge          float64
tons_of_MGP             float64
tons_of_paper           float64
tons_res_organics       float64
tons_sch_organics       float64
tons_leaves_organics    float64
tons_xmastrees          float64
pct_LU1                 float64
pct_LU2                 float64
pct_LU3                 float64
pct_LU4                 float64
pct_LU5                 float64
pct_LU6                 float64
pct_LU7                 float64
pct_LU8                 float64
pct_LU9                 float64
pct_LU10                float64
pct_LU11                float64
percentile               object
dtype: object

In [6]:
# Convert the target column values to Below and Above based on rat data 25%tile values

x = {'Below': 'Below Percentile'}   
df = df.replace(x)

x = dict.fromkeys(['Above'], 'Above_Percentile')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

#df.head()
df.tail(20)

Unnamed: 0,CD,total_rats,tons_of_refuge,tons_of_MGP,tons_of_paper,tons_res_organics,tons_sch_organics,tons_leaves_organics,tons_xmastrees,pct_LU1,...,pct_LU3,pct_LU4,pct_LU5,pct_LU6,pct_LU7,pct_LU8,pct_LU9,pct_LU10,pct_LU11,percentile
39,10 MANHATTAN,6831,1131518.8,54826.4,53265.6,102.9,1005.6,54.9,501.3,4.1,...,22.3,23.2,4.9,0.8,0.9,14.3,6.1,0.6,3.0,Above_Percentile
40,11 MANHATTAN,3286,1006505.2,44949.0,47872.4,3.8,778.3,10.8,360.9,0.3,...,17.5,7.1,1.9,0.4,10.0,9.5,43.1,1.5,4.3,Above_Percentile
41,12 MANHATTAN,4920,2139581.2,147001.7,112987.8,824.9,2409.6,0.0,833.4,0.4,...,10.6,9.8,2.5,0.4,7.7,8.0,43.7,1.8,6.9,Above_Percentile
42,01 QUEENS,3032,1744235.6,163543.1,218689.2,0.0,0.0,1454.6,894.7,16.4,...,7.4,6.0,6.8,8.7,20.1,4.0,7.4,2.3,3.4,Above_Percentile
43,02 QUEENS,2348,1060287.7,98901.4,134628.3,2327.0,1677.4,720.2,596.6,11.4,...,4.5,4.1,6.1,23.6,12.8,3.9,15.4,5.8,4.0,Above_Percentile
44,03 QUEENS,1752,1808392.6,133626.9,138515.3,0.0,0.0,1258.6,618.5,42.6,...,9.3,5.2,9.8,0.5,1.1,6.5,3.9,1.9,0.9,Above_Percentile
45,04 QUEENS,1641,1590480.8,111604.2,125139.7,0.0,0.0,668.7,393.4,27.3,...,11.0,8.1,6.6,1.9,5.3,6.8,2.1,1.3,1.3,Above_Percentile
46,05 QUEENS,4083,1857490.3,198092.1,261565.0,10608.9,4934.7,2867.6,1319.9,31.0,...,0.5,3.0,3.0,9.1,6.8,2.4,31.9,1.3,1.1,Above_Percentile
47,06 QUEENS,970,1094103.8,98775.5,168834.0,0.0,243.5,3569.3,491.0,41.8,...,16.3,5.9,5.9,0.1,6.0,5.5,12.1,0.6,1.2,Below Percentile
48,07 QUEENS,1752,2461125.3,203281.2,328594.4,6053.2,0.0,8761.8,1569.2,33.4,...,4.2,2.6,4.1,4.0,3.0,5.4,10.7,1.8,24.7,Above_Percentile


 # Split our data into training and testing

In [7]:
# Create our features
# create and drop 'fiftieth2517' column then assign to X
X = df.drop(['CD','total_rats', 'percentile'], axis=1)

# Create our target
y = df['percentile']

X.head()

Unnamed: 0,tons_of_refuge,tons_of_MGP,tons_of_paper,tons_res_organics,tons_sch_organics,tons_leaves_organics,tons_xmastrees,pct_LU1,pct_LU2,pct_LU3,pct_LU4,pct_LU5,pct_LU6,pct_LU7,pct_LU8,pct_LU9,pct_LU10,pct_LU11
0,764007.8,32663.3,28960.3,141.5,0.0,15.6,148.7,6.1,7.9,15.2,9.9,5.5,21.3,6.6,11.4,7.0,4.2,4.8
1,582803.0,34424.3,24529.8,0.0,0.0,53.0,173.9,4.4,6.5,3.3,4.4,3.1,39.1,12.1,7.4,3.9,2.5,13.3
2,723072.8,34086.5,28189.1,0.0,0.0,16.5,91.3,10.3,12.6,13.3,10.1,5.8,6.7,2.6,12.5,20.4,3.5,2.2
3,1598950.4,78164.7,61554.6,343.0,0.0,0.0,269.4,4.3,12.3,15.3,10.5,7.0,2.6,8.3,12.3,16.8,6.3,4.2
4,1351276.7,88723.5,58439.9,21.3,0.0,11.9,202.6,11.3,16.4,18.1,14.7,6.7,1.3,3.2,15.3,6.4,2.1,4.7


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature column
X.describe()

Unnamed: 0,tons_of_refuge,tons_of_MGP,tons_of_paper,tons_res_organics,tons_sch_organics,tons_leaves_organics,tons_xmastrees,pct_LU1,pct_LU2,pct_LU3,pct_LU4,pct_LU5,pct_LU6,pct_LU7,pct_LU8,pct_LU9,pct_LU10,pct_LU11
count,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0,59.0
mean,1427666.0,114866.962712,140106.747458,2149.745763,820.545763,2416.198305,724.067797,22.986441,11.245763,9.679661,8.108475,6.305085,4.235593,7.433898,9.481356,13.230508,1.694915,5.615254
std,525484.3,51151.167718,77541.70258,2942.036193,1173.138766,4134.567745,605.304691,17.685416,7.658584,6.992429,6.897858,8.742019,7.305849,7.70637,5.746857,11.32273,1.33448,6.903168
min,401711.6,32663.3,24529.8,0.0,0.0,0.0,91.3,0.1,0.6,0.2,0.4,1.4,0.0,0.4,2.4,0.7,0.2,0.4
25%,1033396.0,80860.55,81880.75,0.0,0.0,16.05,337.85,5.55,6.0,4.35,3.55,3.1,0.5,3.1,5.45,4.3,0.8,1.3
50%,1351277.0,111604.2,132982.5,875.0,0.0,480.3,549.3,22.3,9.0,8.4,6.7,4.1,2.1,5.8,8.0,8.7,1.3,3.2
75%,1798688.0,142588.35,182604.95,3025.35,1588.45,3004.15,891.85,38.8,15.3,14.9,9.95,6.0,3.5,8.9,12.4,18.55,2.05,5.65
max,2674158.0,227144.9,348823.5,10618.1,4934.7,19312.1,3161.5,59.9,32.3,37.1,33.2,65.5,39.1,48.8,32.4,46.0,6.3,36.4


In [9]:
# Summary of the target labels: row count, number of distinct labels, most frequent label and its frequency
y.describe()

count                   59
unique                   2
top       Above_Percentile
freq                    44
Name: percentile, dtype: object

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

 # Create a Logistic Regression Model

In [11]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', max_iter=500,
                                random_state=1)
classifier

LogisticRegression(max_iter=500, random_state=1)

 # Fit (train) our model using the training data

In [12]:
# Fit the classifier on the training features and labels
# (fit() returns the estimator, which is what the cell displays)
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=500, random_state=1)

 # Make predictions

In [13]:
# Predict outcomes for test data set
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
1,Above_Percentile,Above_Percentile
13,Above_Percentile,Above_Percentile
35,Above_Percentile,Below Percentile
46,Below Percentile,Above_Percentile
10,Above_Percentile,Above_Percentile
57,Above_Percentile,Above_Percentile
8,Above_Percentile,Above_Percentile
0,Above_Percentile,Above_Percentile
34,Above_Percentile,Below Percentile
17,Below Percentile,Above_Percentile


# Validate the model using the test data 

In [14]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.7333333333333333

In [19]:
##### Generate a new data point whose rat number is 1309 and is bellow the current %tile
# import numpy as np
new_data = np.array([[1788984.2,141994.4,238538,3386.5,247,3136.5,445.2,49.6,7.8,9.9,4.6,5.3,0.4,3.6,8.3,3.6,0.6,6.2]])



In [20]:
# Predict the class (purple or yellow) of the new data point
predictions = classifier.predict(new_data)
print("Classes are either 0 (purple) or 1 (yellow)")
print(f"The new point was classified as: {predictions}")

Classes are either 0 (purple) or 1 (yellow)
The new point was classified as: ['Below Percentile']


  "X does not have valid feature names, but"
