In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
data = pd.read_csv('branch.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   BranchID  20 non-null     int64 
 1   Country   20 non-null     object
 2   State     20 non-null     object
 3   City      20 non-null     object
 4   Address   20 non-null     object
 5   ZIPCODE   20 non-null     int64 
dtypes: int64(2), object(4)
memory usage: 1.1+ KB


In [4]:
data.head()

Unnamed: 0,BranchID,Country,State,City,Address,ZIPCODE
0,1,United States,Illinois,Skokie,5127 Oakton Street,60077
1,2,United States,Wisconsin,Waukesha,201 Delafield Street,53188
2,3,United States,Illinois,Chicago,121 North LaSalle Street,60602
3,4,United States,New York,Buffalo,65 Niagara Square,14202
4,5,United States,Ohio,Dayton,101 West Third Street,45402


In [5]:
data['Country_lowercase'] = data['Country'].str.lower()

In [6]:
data.head()

Unnamed: 0,BranchID,Country,State,City,Address,ZIPCODE,Country_lowercase
0,1,United States,Illinois,Skokie,5127 Oakton Street,60077,united states
1,2,United States,Wisconsin,Waukesha,201 Delafield Street,53188,united states
2,3,United States,Illinois,Chicago,121 North LaSalle Street,60602,united states
3,4,United States,New York,Buffalo,65 Niagara Square,14202,united states
4,5,United States,Ohio,Dayton,101 West Third Street,45402,united states


In [7]:
data['State_lowercase'] = data['State'].str.lower()
data['City_lowercase'] = data['City'].str.lower()
data['Address_lowercase'] = data['Address'].str.lower()

In [8]:
data.head()

Unnamed: 0,BranchID,Country,State,City,Address,ZIPCODE,Country_lowercase,State_lowercase,City_lowercase,Address_lowercase
0,1,United States,Illinois,Skokie,5127 Oakton Street,60077,united states,illinois,skokie,5127 oakton street
1,2,United States,Wisconsin,Waukesha,201 Delafield Street,53188,united states,wisconsin,waukesha,201 delafield street
2,3,United States,Illinois,Chicago,121 North LaSalle Street,60602,united states,illinois,chicago,121 north lasalle street
3,4,United States,New York,Buffalo,65 Niagara Square,14202,united states,new york,buffalo,65 niagara square
4,5,United States,Ohio,Dayton,101 West Third Street,45402,united states,ohio,dayton,101 west third street


In [10]:
data['Country_tokenized'] = data.apply(lambda x: word_tokenize(x['Country_lowercase']), axis=1)
data['City_tokenized'] = data.apply(lambda x: word_tokenize(x['City_lowercase']), axis=1)
data['State_tokenized'] = data.apply(lambda x: word_tokenize(x['State_lowercase']), axis=1)
data['Address_tokenized'] = data.apply(lambda x: word_tokenize(x['Address_lowercase']), axis=1)

In [11]:
data.head()

Unnamed: 0,BranchID,Country,State,City,Address,ZIPCODE,Country_lowercase,State_lowercase,City_lowercase,Address_lowercase,Country_tokenized,City_tokenized,State_tokenized,Address_tokenized
0,1,United States,Illinois,Skokie,5127 Oakton Street,60077,united states,illinois,skokie,5127 oakton street,"[united, states]",[skokie],[illinois],"[5127, oakton, street]"
1,2,United States,Wisconsin,Waukesha,201 Delafield Street,53188,united states,wisconsin,waukesha,201 delafield street,"[united, states]",[waukesha],[wisconsin],"[201, delafield, street]"
2,3,United States,Illinois,Chicago,121 North LaSalle Street,60602,united states,illinois,chicago,121 north lasalle street,"[united, states]",[chicago],[illinois],"[121, north, lasalle, street]"
3,4,United States,New York,Buffalo,65 Niagara Square,14202,united states,new york,buffalo,65 niagara square,"[united, states]",[buffalo],"[new, york]","[65, niagara, square]"
4,5,United States,Ohio,Dayton,101 West Third Street,45402,united states,ohio,dayton,101 west third street,"[united, states]",[dayton],[ohio],"[101, west, third, street]"


In [12]:
lemmatizer = WordNetLemmatizer()

In [14]:
data['Country_lemmatized'] = data['Country_tokenized'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)

data['City_lemmatized'] = data['City_tokenized'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)

data['State_lemmatized'] = data['State_tokenized'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)

data['Address_lemmatized'] = data['Address_tokenized'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)


In [15]:
data.head()

Unnamed: 0,BranchID,Country,State,City,Address,ZIPCODE,Country_lowercase,State_lowercase,City_lowercase,Address_lowercase,Country_tokenized,City_tokenized,State_tokenized,Address_tokenized,Country_lemmatized,City_lemmatized,State_lemmatized,Address_lemmatized
0,1,United States,Illinois,Skokie,5127 Oakton Street,60077,united states,illinois,skokie,5127 oakton street,"[united, states]",[skokie],[illinois],"[5127, oakton, street]","[united, state]",[skokie],[illinois],"[5127, oakton, street]"
1,2,United States,Wisconsin,Waukesha,201 Delafield Street,53188,united states,wisconsin,waukesha,201 delafield street,"[united, states]",[waukesha],[wisconsin],"[201, delafield, street]","[united, state]",[waukesha],[wisconsin],"[201, delafield, street]"
2,3,United States,Illinois,Chicago,121 North LaSalle Street,60602,united states,illinois,chicago,121 north lasalle street,"[united, states]",[chicago],[illinois],"[121, north, lasalle, street]","[united, state]",[chicago],[illinois],"[121, north, lasalle, street]"
3,4,United States,New York,Buffalo,65 Niagara Square,14202,united states,new york,buffalo,65 niagara square,"[united, states]",[buffalo],"[new, york]","[65, niagara, square]","[united, state]",[buffalo],"[new, york]","[65, niagara, square]"
4,5,United States,Ohio,Dayton,101 West Third Street,45402,united states,ohio,dayton,101 west third street,"[united, states]",[dayton],[ohio],"[101, west, third, street]","[united, state]",[dayton],[ohio],"[101, west, third, street]"


In [16]:
tokens_clean = sum(data['Country_lemmatized'], []) + sum(data['State_lemmatized'], []) + sum(data['Address_lemmatized'], []) + sum(data['City_lemmatized'], [])

In [17]:
data.head()

Unnamed: 0,BranchID,Country,State,City,Address,ZIPCODE,Country_lowercase,State_lowercase,City_lowercase,Address_lowercase,Country_tokenized,City_tokenized,State_tokenized,Address_tokenized,Country_lemmatized,City_lemmatized,State_lemmatized,Address_lemmatized
0,1,United States,Illinois,Skokie,5127 Oakton Street,60077,united states,illinois,skokie,5127 oakton street,"[united, states]",[skokie],[illinois],"[5127, oakton, street]","[united, state]",[skokie],[illinois],"[5127, oakton, street]"
1,2,United States,Wisconsin,Waukesha,201 Delafield Street,53188,united states,wisconsin,waukesha,201 delafield street,"[united, states]",[waukesha],[wisconsin],"[201, delafield, street]","[united, state]",[waukesha],[wisconsin],"[201, delafield, street]"
2,3,United States,Illinois,Chicago,121 North LaSalle Street,60602,united states,illinois,chicago,121 north lasalle street,"[united, states]",[chicago],[illinois],"[121, north, lasalle, street]","[united, state]",[chicago],[illinois],"[121, north, lasalle, street]"
3,4,United States,New York,Buffalo,65 Niagara Square,14202,united states,new york,buffalo,65 niagara square,"[united, states]",[buffalo],"[new, york]","[65, niagara, square]","[united, state]",[buffalo],"[new, york]","[65, niagara, square]"
4,5,United States,Ohio,Dayton,101 West Third Street,45402,united states,ohio,dayton,101 west third street,"[united, states]",[dayton],[ohio],"[101, west, third, street]","[united, state]",[dayton],[ohio],"[101, west, third, street]"


In [18]:
nlp = spacy.load("en_core_web_sm")


In [19]:
spacy_doc = nlp(' '.join(tokens_clean))

In [20]:
pos_df = pd.DataFrame(columns=['token', 'pos_tag'])

for token in spacy_doc:
    pos_df = pd.concat([pos_df,
                       pd.DataFrame.from_records([{'token': token.text,'pos_tag': token.pos_}])], ignore_index=True)

In [21]:
pos_df_counts = pos_df.groupby(['token','pos_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)
pos_df_counts.head(10)

Unnamed: 0,token,pos_tag,counts
78,state,PROPN,20
83,united,PROPN,20
80,street,PROPN,8
21,avenue,PROPN,4
79,street,NOUN,4
41,illinois,PROPN,3
89,west,PROPN,3
66,ohio,PROPN,3
64,north,PROPN,2
77,st,PROPN,2


In [22]:
pos_df_counts.head()

Unnamed: 0,token,pos_tag,counts
78,state,PROPN,20
83,united,PROPN,20
80,street,PROPN,8
21,avenue,PROPN,4
79,street,NOUN,4


In [23]:
ner_df = pd.DataFrame(columns=['token', 'ner_tag'])

for token in spacy_doc.ents:
    if pd.isna(token.label_) is False:
        ner_df = pd.concat([ner_df, pd.DataFrame.from_records(
            [{'token': token.text, 'ner_tag': token.label_}])], ignore_index=True)

In [24]:
ner_df.head()

Unnamed: 0,token,ner_tag
0,united state,ORG
1,united state,ORG
2,united state,ORG
3,united state,ORG
4,united state,ORG


In [25]:
ner_df_counts = ner_df.groupby(['token','ner_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)
ner_df_counts.head(10)

Unnamed: 0,token,ner_tag,counts
40,united state,ORG,20
0,101,CARDINAL,2
32,ohio,GPE,2
43,wisconsin,GPE,2
4,15,CARDINAL,1
5,2,CARDINAL,1
26,louis urbana marquette,PERSON,1
27,michigan,GPE,1
28,missouri,GPE,1
29,mulberry street,FAC,1


In [27]:
iris = load_iris()

In [28]:
x = iris.data
y = iris.target

In [29]:
print(x)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=45)

In [31]:
lr = LogisticRegression()

In [32]:
lr.fit(x_train, y_train)

In [33]:
y_pred = lr.predict(x_test)

In [34]:
accuracy_score(y_pred, y_test)

0.9555555555555556

In [35]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       0.87      1.00      0.93        13
           2       1.00      0.87      0.93        15

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.95        45
weighted avg       0.96      0.96      0.96        45



In [36]:
predicted_species = iris.target_names[y_pred]

In [37]:
actual_species = iris.target_names[y_test]

In [38]:
print("Predicted Species:",predicted_species  )

Predicted Species: ['setosa' 'setosa' 'virginica' 'setosa' 'setosa' 'setosa' 'setosa'
 'virginica' 'virginica' 'virginica' 'setosa' 'virginica' 'virginica'
 'virginica' 'versicolor' 'setosa' 'virginica' 'virginica' 'setosa'
 'versicolor' 'versicolor' 'versicolor' 'virginica' 'versicolor' 'setosa'
 'virginica' 'versicolor' 'versicolor' 'setosa' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'virginica' 'setosa' 'virginica' 'setosa'
 'setosa' 'versicolor' 'setosa' 'setosa' 'versicolor' 'setosa'
 'versicolor' 'versicolor']


In [39]:
print("Actual Species:",actual_species )

Actual Species: ['setosa' 'setosa' 'virginica' 'setosa' 'setosa' 'setosa' 'setosa'
 'virginica' 'virginica' 'virginica' 'setosa' 'virginica' 'virginica'
 'virginica' 'virginica' 'setosa' 'virginica' 'virginica' 'setosa'
 'versicolor' 'versicolor' 'versicolor' 'virginica' 'versicolor' 'setosa'
 'virginica' 'versicolor' 'versicolor' 'setosa' 'versicolor' 'versicolor'
 'versicolor' 'virginica' 'virginica' 'setosa' 'virginica' 'setosa'
 'setosa' 'versicolor' 'setosa' 'setosa' 'versicolor' 'setosa'
 'versicolor' 'versicolor']


In [40]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9555555555555556


In [41]:
print("Enter flower measurement: ")
sepal_length = float(input("Sepal length(cm): "))
sepal_width = float(input("Sepal width(cm):  "))
petal_length = float(input("Petal length(cm): "))
petal_width = float(input("Petal width(cm):  "))

Enter flower measurement: 


In [42]:
user_input = np.array([[sepal_length,sepal_width,petal_length,petal_width]])

In [43]:
prediction = lr.predict(user_input)

In [44]:
species_name = iris.target_names[prediction[0]]

In [45]:
print("Predicted Species: ", species_name)

Predicted Species:  setosa
