# Titanic Dataset

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis='columns',inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [4]:
inputs = df.drop('Survived',axis='columns')
target = df.Survived

In [5]:
dummies = pd.get_dummies(inputs.Sex)  # inputs.Sex = inputs.Sex.map({'male': 1, 'female': 2})
dummies.head(3)

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0


In [6]:
inputs = pd.concat([inputs,dummies],axis='columns')
inputs.head(3)

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0


<b>I am dropping male column as well because of dummy variable trap theory. One column is enough to repressent male vs female

In [7]:
inputs.drop(['Sex','male'],axis='columns',inplace=True)
inputs.head(3)

Unnamed: 0,Pclass,Age,Fare,female
0,3,22.0,7.25,0
1,1,38.0,71.2833,1
2,3,26.0,7.925,1


In [8]:
inputs.columns[inputs.isna().any()] # To check any NaN Values

Index(['Age'], dtype='object')

In [9]:
inputs.Age[:10]

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [10]:
inputs.Age = inputs.Age.fillna(inputs.Age.mean())
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female
0,3,22.0,7.25,0
1,1,38.0,71.2833,1
2,3,26.0,7.925,1
3,1,35.0,53.1,1
4,3,35.0,8.05,0


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs,target,test_size=0.3)

In [12]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [13]:
model.fit(X_train,y_train)

GaussianNB()

In [14]:
model.score(X_test,y_test)

0.7798507462686567

In [15]:
X_test[0:10]

Unnamed: 0,Pclass,Age,Fare,female
829,1,62.0,80.0,1
146,3,27.0,7.7958,0
493,1,71.0,49.5042,0
442,3,25.0,7.775,0
691,3,4.0,13.4167,1
198,3,29.699118,7.75,1
875,3,15.0,7.225,1
358,3,29.699118,7.8792,1
887,1,19.0,30.0,1
745,1,70.0,71.0,0


In [16]:
y_test[0:10]

829    1
146    1
493    0
442    0
691    1
198    1
875    1
358    1
887    1
745    0
Name: Survived, dtype: int64

In [17]:
model.predict(X_test[0:10])

array([1, 0, 1, 0, 1, 1, 1, 1, 1, 1], dtype=int64)

<b>Calculate the score using cross validation

In [18]:
from sklearn.model_selection import cross_val_score
cross_val_score(GaussianNB(),X_train, y_train, cv=5)

array([0.752     , 0.768     , 0.76      , 0.75806452, 0.83064516])

---

# Spam Detection

In [19]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [21]:
df['spam']=df['Category'].apply(lambda x: 1 if x=='spam' else 0) # Converting categories into numbers in category column
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.Message,df.spam, test_size=0.25)

In [23]:
from sklearn.feature_extraction.text import CountVectorizer # To convert text in message column to numbers using countvectorizer method
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
X_train_count.toarray()[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [24]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_count,y_train)

MultinomialNB()

In [25]:
#Predicting model
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1], dtype=int64)

In [26]:
X_test_count = v.transform(X_test)
model.score(X_test_count, y_test)

0.9834888729361091

<b> Sklearn Pipeline

In [27]:
from sklearn.pipeline import Pipeline  # internally this pipeline will convert text into numbers and then will apply model
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [28]:
clf.fit(X_train, y_train) # internally this pipeline will convert text into numbers and then will apply model, therefore X_train is used instead of X_train_count

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [29]:
clf.score(X_test,y_test)

0.9834888729361091

In [30]:
clf.predict(emails)

array([0, 1], dtype=int64)

---

# Wine Dataset

<b> Use wine dataset from sklearn.datasets to classify wines into 3 categories. Load the dataset and split it into test and train. After that train the model using Gaussian and Multinominal classifier and post which model performs better. Use the trained model to perform some predictions on test data.

In [31]:
from sklearn import datasets
wine = datasets.load_wine()

In [32]:
dir(wine)

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']

In [33]:
wine.data[0:2]

array([[1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
        3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, 1.120e+01, 1.000e+02, 2.650e+00,
        2.760e+00, 2.600e-01, 1.280e+00, 4.380e+00, 1.050e+00, 3.400e+00,
        1.050e+03]])

In [34]:
wine.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [35]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [36]:
wine.target[0:2]

array([0, 0])

In [37]:
import pandas as pd
df = pd.DataFrame(wine.data,columns=wine.feature_names)
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [38]:
df['target'] = wine.target
df[50:70]

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
50,13.05,1.73,2.04,12.4,92.0,2.72,3.27,0.17,2.91,7.2,1.12,2.91,1150.0,0
51,13.83,1.65,2.6,17.2,94.0,2.45,2.99,0.22,2.29,5.6,1.24,3.37,1265.0,0
52,13.82,1.75,2.42,14.0,111.0,3.88,3.74,0.32,1.87,7.05,1.01,3.26,1190.0,0
53,13.77,1.9,2.68,17.1,115.0,3.0,2.79,0.39,1.68,6.3,1.13,2.93,1375.0,0
54,13.74,1.67,2.25,16.4,118.0,2.6,2.9,0.21,1.62,5.85,0.92,3.2,1060.0,0
55,13.56,1.73,2.46,20.5,116.0,2.96,2.78,0.2,2.45,6.25,0.98,3.03,1120.0,0
56,14.22,1.7,2.3,16.3,118.0,3.2,3.0,0.26,2.03,6.38,0.94,3.31,970.0,0
57,13.29,1.97,2.68,16.8,102.0,3.0,3.23,0.31,1.66,6.0,1.07,2.84,1270.0,0
58,13.72,1.43,2.5,16.7,108.0,3.4,3.67,0.19,2.04,6.8,0.89,2.87,1285.0,0
59,12.37,0.94,1.36,10.6,88.0,1.98,0.57,0.28,0.42,1.95,1.05,1.82,520.0,1


In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.3, random_state=100)

In [40]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
model = GaussianNB()
model.fit(X_train,y_train)

GaussianNB()

In [41]:
model.score(X_test,y_test)

1.0

In [42]:
mn = MultinomialNB()
mn.fit(X_train,y_train)
mn.score(X_test,y_test)

0.7777777777777778

---

# Simple Exampel of Playing Tennis

In [43]:
import numpy as np

In [44]:
data = pd.read_csv('play_tennis.csv')

In [45]:
data.head()

Unnamed: 0,day,outlook,temp,humidity,wind,play
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes


In [46]:
data.drop(['day'],axis=1,inplace=True)

In [47]:
# Problem 1:
# Outlook = Sunny, Temp = Hot, Humidity = High, Wind = Weak
# Play or Not Play ?

In [48]:
# Solution:
# P(Yes/Sunny,Hot,High,Weak) = P(Sunny/Yes) * P(Hot/Yes) * P(High/Yes) * P(Weak/Yes) * P(Yes)
# P(No/Sunny,Hot,High/Weak) = P(Sunny/No) * P(Hot/No) * P(High/No) * P(Weak/No) * P(No)
# Compare and decide using the maximum pasteriori rule

In [49]:
data['play'].value_counts()

Yes    9
No     5
Name: play, dtype: int64

In [50]:
py=9/14
pn=5/14

In [51]:
py,pn

(0.6428571428571429, 0.35714285714285715)

In [52]:
# Outlook
pd.crosstab(data['outlook'],data['play'])

play,No,Yes
outlook,Unnamed: 1_level_1,Unnamed: 2_level_1
Overcast,0,4
Rain,2,3
Sunny,3,2


In [53]:
psn=3/5
psy=2/9

In [54]:
# Temperature
pd.crosstab(data['temp'],data['play'])

play,No,Yes
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
Cool,1,3
Hot,2,2
Mild,2,4


In [55]:
photn=2/5
photy=2/9

In [56]:
# Humidity
pd.crosstab(data['humidity'],data['play'])

play,No,Yes
humidity,Unnamed: 1_level_1,Unnamed: 2_level_1
High,4,3
Normal,1,6


In [57]:
phn=4/5
phy=3/9

In [58]:
# Wind
pd.crosstab(data['wind'],data['play'])

play,No,Yes
wind,Unnamed: 1_level_1,Unnamed: 2_level_1
Strong,3,3
Weak,2,6


In [59]:
pwn=2/5
pwy=6/9

In [81]:
# P(Yes/Sunny,Hot,High,Weak) = P(Sunny/Yes) * P(Hot/Yes) * P(High/Yes) * P(Weak/Yes) * P(Yes)
a = psy*photy*phy*pwy*py
-a

-0.007054673721340387

In [61]:
# P(No/Sunny,Hot,High/Weak) = P(Sunny/No) * P(Hot/No) * P(High/No) * P(Weak/No) * P(No)
b = psn*photn*phn*pwn*pn
b

0.02742857142857143

In [62]:
# Since b > a So NO PLAY

---

# Breast Cancer Detection

In [78]:
#load dataset
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

In [80]:
data.data

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [82]:
data.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [83]:
data.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [84]:
data.target_names

array(['malignant', 'benign'], dtype='<U9')

In [85]:
# create dtaframe
df = pd.DataFrame(np.c_[data.data, data.target], columns=[list(data.feature_names)+['target']])
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [86]:
df.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,0.0
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,0.0
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,...,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,0.0
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,0.0
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,...,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,1.0


In [87]:
df.shape

(569, 31)

In [88]:
"""### Split Data"""
 
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

In [89]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2020)
 
print('Shape of X_train = ', X_train.shape)
print('Shape of y_train = ', y_train.shape)
print('Shape of X_test = ', X_test.shape)
print('Shape of y_test = ', y_test.shape)

Shape of X_train =  (455, 30)
Shape of y_train =  (455,)
Shape of X_test =  (114, 30)
Shape of y_test =  (114,)


In [90]:
"""## Train Naive Bayes Classifier Model"""
 
from sklearn.naive_bayes import GaussianNB
 
classifier = GaussianNB()
classifier.fit(X_train, y_train)
 
classifier.score(X_test, y_test)



0.9736842105263158

In [91]:
from sklearn.naive_bayes import MultinomialNB
classifier_m = MultinomialNB()
classifier_m.fit(X_train, y_train)
 
classifier_m.score(X_test, y_test)



0.8947368421052632

In [93]:
from sklearn.naive_bayes import BernoulliNB
classifier_b = BernoulliNB()
classifier_b.fit(X_train, y_train)
 
classifier_b.score(X_test, y_test)



0.5789473684210527

In [94]:
"""## Predict Cancer"""
 
patient1 = [17.99,
 10.38,
 122.8,
 1001.0,
 0.1184,
 0.2776,
 0.3001,
 0.1471,
 0.2419,
 0.07871,
 1.095,
 0.9053,
 8.589,
 153.4,
 0.006399,
 0.04904,
 0.05373,
 0.01587,
 0.03003,
 0.006193,
 25.38,
 17.33,
 184.6,
 2019.0,
 0.1622,
 0.6656,
 0.7119,
 0.2654,
 0.4601,
 0.1189]

In [95]:
patient1 = np.array([patient1])
patient1

array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
        3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
        8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
        3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
        1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01]])

In [96]:
classifier.predict(patient1)

array([0.])

In [97]:
data.target_names

array(['malignant', 'benign'], dtype='<U9')

In [98]:
pred = classifier.predict(patient1)

In [99]:
if pred[0] == 0:
  print('Patient has Cancer (malignant tumor)')
else:
  print('Patient has no Cancer (malignant benign)')

Patient has Cancer (malignant tumor)


---

# Sentiment Analysis

In [63]:
df=pd.read_csv('IMDB Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [64]:
df['review'][0]  # One Review

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

## Text Cleaning

<b> 1-Sample 1000 rows <br> `2-Remove html tags <br>
3-Remove Special characters <br>
4-Converting everything to lower case<br>
5-Removing stop words <br>
6-Stemming

In [65]:
df=df.sample(1000)

In [66]:
df.shape

(1000, 2)

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 40602 to 42032
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     1000 non-null   object
 1   sentiment  1000 non-null   object
dtypes: object(2)
memory usage: 23.4+ KB


In [68]:
df.head()

Unnamed: 0,review,sentiment
40602,Night of the Comet starts as the world prepare...,negative
25801,Where's Michael Caine when you need him? I've ...,negative
24983,Having obtained a copy of Bostocks Cup I must ...,positive
31511,"Having read the novel before seeing this film,...",negative
38815,This movie is like Happiness meets Lost in Tra...,positive


In [69]:
df['sentiment'].replace({"positive":1,"negative":0},inplace=True)

In [70]:
df.head()

Unnamed: 0,review,sentiment
40602,Night of the Comet starts as the world prepare...,0
25801,Where's Michael Caine when you need him? I've ...,0
24983,Having obtained a copy of Bostocks Cup I must ...,1
31511,"Having read the novel before seeing this film,...",0
38815,This movie is like Happiness meets Lost in Tra...,1


In [71]:
import re
clean=re.compile('<.*?>:')
re.sub(clean,'',df.iloc[2].review)

"Having obtained a copy of Bostocks Cup I must confess It is not as funny as I originally thought!! IT IS BETTER!!!!! Charlie Williams ... eat your heart out. Match fixing???? Never! Sloping pitch at 45%? Ronnie and Reggie Kay? George Best? The Coach Driver who thinks Pontefract is in South Wales ( It's all Ponty this and Ponty that)Bertie Masson's (Tim Healey's)lucky Cup hat!! (not that he's into gimmicks) Sugar Plum Fairy????? Confused???? Watch it again. The innovative use of real footage with Bostock players was brilliant and the producer should be proud of giving us a MASTERPIECE. Come on ITV do the viewers and yourself a favour - show it again!!! Please>"

In [72]:
#Function to clean html tags
def clean_html (text):
    clean=re.compile('<.*?>:')
    re.sub(clean,'',df.iloc[2].review)

In [73]:
df['review']=df['review'].apply(clean_html)

In [74]:
#Converting everything to lower 
def convert_lower(text):
    return text.lower()

In [75]:
df['review']=df['review'].apply(convert_lower)

AttributeError: 'NoneType' object has no attribute 'lower'

In [76]:
#function to remove special characters
def remove_special (text):
    x=''
    for i in text:
        if i.isalnum():
            x=x+i
        else:
            x=x + ''
    return x

In [77]:
df['review']=df['review'].apply(remove_special)

TypeError: 'NoneType' object is not iterable