In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter('ignore', FutureWarning)

In [2]:
# Load the raw data into a pandas DataFrame.
#
# 'Data_2016.csv' is a renamed copy of the original '2016_NAICS_452990.csv'
# (the data was copied into a new workbook and saved under the new name).
df_read = pd.read_csv(
    'Data_2016.csv',
    encoding='latin-1',
)

In [3]:
# Keep only the three columns used below and give them readable names.
# The mapping's key order fixes the column order of the resulting frame.
column_names = {'GEO.id2': 'County', 'EMP': 'NumEmployees', 'ESTAB': 'NumEstab'}
df = df_read[list(column_names)].copy().rename(columns=column_names)
df.head()

Unnamed: 0,County,NumEmployees,NumEstab
0,1001,109,11
1,1003,410,52
2,1005,65,8
3,1007,28,5
4,1009,b,12


In [4]:
# Report the dtype pandas inferred for each of the two columns encoded below.
for label, column in (("County: ", 'County'), ("Num Employees: ", 'NumEmployees')):
    print(label, df[column].dtype)

County:  int64
Num Employees:  object


In [5]:
# One-hot encode 'County' and 'NumEmployees' and append the indicator columns to df.
#
# drop_first=False keeps an indicator column for every distinct value.
# drop_first=True would instead drop the column of the first level
# (it removes a column, not a row of data).
#
# NOTE(review): 'NumEmployees' has dtype object (it mixes counts with letter
# codes such as 'b'), so each distinct value becomes its own column and the
# numeric ordering of the counts is not used. 'County' looks like a per-row
# identifier -- TODO confirm that encoding it this way is intended.
county_dummies = pd.get_dummies(df['County'], drop_first=False, prefix="County")
employee_dummies = pd.get_dummies(df['NumEmployees'], drop_first=False, prefix="NumEmployees")
df = pd.concat([df, county_dummies, employee_dummies], axis=1)

In [6]:
# The raw 'County' and 'NumEmployees' columns are now represented by their
# indicator columns, so remove them from the frame.
raw_columns = ['County', 'NumEmployees']
df = df.drop(labels=raw_columns, axis=1)
df.head()

Unnamed: 0,NumEstab,County_1001,County_1003,County_1005,County_1007,County_1009,County_1011,County_1013,County_1015,County_1017,...,NumEmployees_973,NumEmployees_98,NumEmployees_980,NumEmployees_989,NumEmployees_99,NumEmployees_993,NumEmployees_994,NumEmployees_a,NumEmployees_b,NumEmployees_c
0,11,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,52,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,12,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [7]:
# Binarize the target: 1 if there are 10+ establishments, otherwise 0.
#
# The label is computed from the untouched raw column (df_read['ESTAB'], the
# source of 'NumEstab') rather than from df['NumEstab'] itself, so re-running
# this cell always gives the same result. Thresholding the already-binarized
# column a second time would silently turn every label into 0, because both
# 0 and 1 are below the threshold.
MIN_ESTABLISHMENTS = 10
df['NumEstab'] = (df_read['ESTAB'] >= MIN_ESTABLISHMENTS).astype('int64')

df.head()

Unnamed: 0,NumEstab,County_1001,County_1003,County_1005,County_1007,County_1009,County_1011,County_1013,County_1015,County_1017,...,NumEmployees_973,NumEmployees_98,NumEmployees_980,NumEmployees_989,NumEmployees_99,NumEmployees_993,NumEmployees_994,NumEmployees_a,NumEmployees_b,NumEmployees_c
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [8]:
#Create the train and test split
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
features = df.drop('NumEstab', axis=1)
target = df['NumEstab']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.30, random_state=10
)

In [9]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression (default settings) on the training split.
# fit() returns the fitted estimator, so it can be bound directly.
logmodel = LogisticRegression().fit(X_train, y_train)

# Predict class labels for the held-out test rows.
predictions = logmodel.predict(X_test)

In [10]:
# Display the predicted labels for the test set (1 = 10+ establishments, 0 = fewer).
predictions

array([1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,

In [11]:
#Model Evaluation
from sklearn.metrics import classification_report


In [12]:
# Split the test rows into the four confusion-matrix cells and compute accuracy.
predicted_pos = predictions == 1
predicted_neg = predictions == 0
is_correct = y_test == predictions

truePos = X_test[predicted_pos & is_correct]
falsePos = X_test[predicted_pos & ~is_correct]
trueNeg = X_test[predicted_neg & is_correct]
falseNeg = X_test[predicted_neg & ~is_correct]

# Row counts of each cell.
TP, FP, TN, FN = (len(cell) for cell in (truePos, falsePos, trueNeg, falseNeg))

# Accuracy = correctly classified rows / all test rows.
accuracy = float(TP + TN) / float(TP + TN + FP + FN)
print('Accuracy: ' + str(accuracy))

Accuracy: 0.8911335578002245


In [13]:
# Feature frame: every column of df except the 'NumEstab' target.
categoricals = df.drop(labels='NumEstab', axis=1)
print(categoricals)

# Map each feature column name to its positional index in the frame.
index_dict = {name: position for position, name in enumerate(categoricals.columns)}
print(index_dict)

      County_1001  County_1003  County_1005  County_1007  County_1009  \
0               1            0            0            0            0   
1               0            1            0            0            0   
2               0            0            1            0            0   
3               0            0            0            1            0   
4               0            0            0            0            1   
5               0            0            0            0            0   
6               0            0            0            0            0   
7               0            0            0            0            0   
8               0            0            0            0            0   
9               0            0            0            0            0   
10              0            0            0            0            0   
11              0            0            0            0            0   
12              0            0            0        

In [14]:
# Start from an all-zero feature vector with one slot per feature column.
n_features = len(index_dict)
categorical_vector = np.zeros(shape=n_features)
print(categorical_vector)

[0. 0. 0. ... 0. 0. 0.]


In [15]:
# Describe the example to predict by switching on its indicator columns.
for column_name in ('County_1007', 'NumEmployees_28'):
    categorical_vector[index_dict[column_name]] = 1

In [16]:
# Turn the flat vector into a single-row, two-dimensional array (shape (1, n)).
categorical_vector = np.reshape(np.array(categorical_vector), (1, -1))

In [17]:
# Show the vector again; the double brackets confirm it is now two-dimensional.
print(categorical_vector)

[[0. 0. 0. ... 0. 0. 0.]]


In [18]:
# Predict the class of the hand-built example.
#
# The model was fitted on a DataFrame, so the vector is wrapped in a one-row
# DataFrame carrying the training column names. Newer scikit-learn versions
# warn when a model fitted with feature names is given a bare array, and the
# labels make the expected column order explicit.
example = pd.DataFrame(categorical_vector, columns=X_train.columns)
prediction = logmodel.predict(example)

In [19]:
# Show the predicted class for the example (1 = 10+ establishments, 0 = fewer).
print(prediction)

[0]
