In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as ms
import sklearn

# Data Reading

In [None]:
# Load the raw training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train_2v.csv', 'test_2v.csv'))
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first five rows of the test table (5 is the head() default).
test.head(n=5)

In [None]:
# (rows, columns) of the training table as loaded from CSV.
train.shape

In [None]:
# (rows, columns) of the test table as loaded from CSV.
test.shape

# Data Cleaning


**Identifying missing attributes**

In [None]:
# Per-column count of missing cells in the training table.
train_missing_values = train.isna().sum()
train_missing_values

In [None]:
# Per-column count of missing cells in the test table.
test_missing_values = test.isna().sum()
test_missing_values

In [None]:
# Visualise where values are missing in the raw training table.
ms.matrix(df=train)

**Removing missing values**

In [None]:
# Keep only fully populated rows; dropna() defaults to axis=0, how="any",
# so any row with at least one missing value is removed.
train_data = train.dropna()
test_data = test.dropna()
print('train data shape: {}'.format(train_data.shape))
print('test data shape: {}'.format(test_data.shape))

In [None]:
# Re-draw the missingness matrix for the training rows that survived dropna().
ms.matrix(df=train_data)

In [None]:
# Re-draw the missingness matrix for the test rows that survived dropna().
ms.matrix(df=test_data)

# Pattern Recognition

In [None]:
# Number of cleaned training rows per value of the "stroke" column.
train_data["stroke"].value_counts()

In [None]:
# Bar chart of how many cleaned training rows fall in each "stroke" class.
sns.countplot(data=train_data, x="stroke")
plt.title("no of patients affected by stroke", fontsize=15)
plt.show()

In [None]:
# Row counts per "gender", with one bar per "stroke" value inside each gender.
sns.countplot(data=train_data, x="gender", hue="stroke")
plt.title("gender vs stroke", fontsize=15)
plt.show()

In [None]:
# Within each "gender" group, count the rows per "stroke" value.
train_data.groupby(["gender"])["stroke"].value_counts()


In [None]:
# Number of cleaned training rows per "smoking_status" value.
train_data["smoking_status"].value_counts()

In [None]:
# Within each "gender" group, count the rows per "smoking_status" value.
train_data.groupby(["gender"])["smoking_status"].value_counts()


In [None]:
# Row counts per "gender", with one bar per "smoking_status" value inside each gender.
sns.countplot(data=train_data, x="gender", hue="smoking_status")
plt.title("gender vs type of smokers", fontsize=15)
plt.show()

### Converting string columns to integer codes

In [None]:
# Split off the object-dtype (string) columns of each cleaned table.
str_data = train_data.select_dtypes(include='object')
str_dt = test_data.select_dtypes(include='object')

In [None]:
# Split off the integer- and float-typed columns of each cleaned table.
int_data = train_data.select_dtypes(include=('integer', 'float'))
int_dt = test_data.select_dtypes(include=('integer', 'float'))


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode every string column of the training table as integer codes, then
# re-attach the numeric columns by index. The single encoder is re-fitted on
# each column in turn, so afterwards it only holds the last column's classes.
label = LabelEncoder()
features = str_data.apply(label.fit_transform).join(int_data)
features.head()

In [None]:
# Encode the test table's string columns with encoders fitted on the TRAINING
# columns, so a given category maps to the same integer code in both tables.
# Re-fitting on the test data (as before) silently produces different codes
# whenever the test set's categories differ from the training set's.
# LabelEncoder.transform raises ValueError if the test data contains a
# category absent from training, which surfaces the mismatch instead of
# mis-coding it.
# NOTE(review): assumes every string column of test_data also exists in
# train_data — confirm against the CSV schemas.
test1 = pd.DataFrame(
    {
        col: LabelEncoder().fit(str_data[col]).transform(str_dt[col])
        for col in str_dt.columns
    },
    index=str_dt.index,  # keep the original row labels so the join below aligns
)
Test = test1.join(int_dt)
Test.head()

# Modelling & predicting the data

In [None]:
# Feature matrix: every encoded column except the "stroke" target.
xtrain = features.drop(columns=["stroke"])
xtrain.shape

In [None]:
# Target vector: the "stroke" column of the encoded training table.
ytrain = features["stroke"]
# A bare expression in the middle of a cell is never displayed, so print the
# preview explicitly; the shape below is shown as the cell's output.
print(ytrain.head())
ytrain.shape

In [None]:

from sklearn.model_selection import train_test_split
# Hold out part of the labelled data for evaluation.
# random_state pins the shuffle so every "Restart & Run All" yields the same
# split (and therefore the same scores); stratify keeps the proportion of each
# "stroke" class the same in both halves.
x_train, x_test, y_train, y_test = train_test_split(
    xtrain, ytrain, random_state=42, stratify=ytrain
)

In [None]:
# (rows, columns) of the held-out feature matrix.
x_test.shape

In [None]:
# Length of the held-out target vector.
y_test.shape

In [None]:
# Preview the first five rows of the training features (5 is the head() default).
x_train.head(n=5)

In [None]:
# Preview the first five training targets (5 is the head() default).
y_train.head(n=5)

## Naive_Bayes

In [None]:
# Preview the first five rows of the held-out features (5 is the head() default).
x_test.head(n=5)

In [None]:
# Preview the first five held-out targets (5 is the head() default).
y_test.head(n=5)

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Fit a Gaussian naive Bayes classifier on the training split.
model = GaussianNB()
model.fit(X=x_train, y=y_train)

In [None]:
# Predicted class for every row of the held-out features.
predict = model.predict(X=x_test)
predict

In [None]:
# Score of the naive Bayes model on the held-out split (the estimator's
# default score() metric).
test_score = model.score(X=x_test, y=y_test)
print("NBtest_score:", test_score)

### Confusion matrix

In [None]:
# Cross-tabulate true labels (rows) against naive Bayes predictions (columns).
# Both axes are named explicitly; without colnames the predicted axis is
# labelled "col_0", which makes the table ambiguous to read.
nb_conf_mtr = pd.crosstab(
    y_test, predict, rownames=["actual"], colnames=["predicted"]
)
nb_conf_mtr

### Report for Naive_Bayes

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Text classification report for the naive Bayes predictions on the held-out split.
nbreport = classification_report(y_true=y_test, y_pred=predict)
print(nbreport)


## Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fit a decision tree on the training split.
# random_state is fixed because the tree's split search is randomised; without
# it, reruns on the same data can produce different trees and scores.
dt_mod = DecisionTreeClassifier(random_state=42)
dt_mod.fit(x_train, y_train)

In [None]:
# Predicted class from the decision tree for every row of the held-out features.
y_predict = dt_mod.predict(X=x_test)
y_predict

In [None]:
# Score of the decision tree on the held-out split (the estimator's default
# score() metric).
ts_dt_score = dt_mod.score(X=x_test, y=y_test)
print("Decision tree test score:", ts_dt_score)

### Reports for decision tree

In [None]:
# Text classification report for the decision-tree predictions on the held-out split.
dectree_report = classification_report(y_true=y_test, y_pred=y_predict)
print(dectree_report)

In [None]:
# Cross-tabulate true labels (rows) against decision-tree predictions (columns).
# Both axes are named explicitly; without colnames the predicted axis is
# labelled "col_0", which makes the table ambiguous to read.
dt_conf_mtr = pd.crosstab(
    y_test, y_predict, rownames=["actual"], colnames=["predicted"]
)
dt_conf_mtr

## Random Forest

In [None]:

from sklearn.ensemble import RandomForestClassifier
# Random forest with 100 trees. random_state is fixed because the forest is
# built from random bootstrap samples and random feature subsets; without it,
# every run trains a different model.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)