In [1]:
# **DATA PROCESSING**

import numpy as np # Array Processing
import pandas as pd # Data Processing 
import os # Input of Data

# **DATA ANALYSIS**

import seaborn as sns # Graphs
import matplotlib.pyplot as plt # Plots

# **PRE PROCESSING**

from sklearn.preprocessing import FunctionTransformer # Transforming of Data

# **MODELS**

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# **METRICS REPORT**

from sklearn.metrics import r2_score
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/cancer-patients-and-air-pollution-a-new-link/cancer patient data sets.csv


Lets import our main data into the notebook

In [2]:
data = pd.read_csv("/kaggle/input/cancer-patients-and-air-pollution-a-new-link/cancer patient data sets.csv")

It is a good habit to take a look at the data first. It gives us a lot of knowledge

In [3]:
data

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,P995,44,1,6,7,7,7,7,6,...,5,3,2,7,8,2,4,5,3,High
996,996,P996,37,2,6,8,7,7,7,6,...,9,6,5,7,2,4,3,1,4,High
997,997,P997,25,2,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
998,998,P998,18,2,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High


The data have  $1000$rows and $26$columns accounting to $26000$ values in total. There are various kind of values in the dta ranging from both numerical to ctegorical. Lets get more info about all the features/columns

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   index                     1000 non-null   int64 
 1   Patient Id                1000 non-null   object
 2   Age                       1000 non-null   int64 
 3   Gender                    1000 non-null   int64 
 4   Air Pollution             1000 non-null   int64 
 5   Alcohol use               1000 non-null   int64 
 6   Dust Allergy              1000 non-null   int64 
 7   OccuPational Hazards      1000 non-null   int64 
 8   Genetic Risk              1000 non-null   int64 
 9   chronic Lung Disease      1000 non-null   int64 
 10  Balanced Diet             1000 non-null   int64 
 11  Obesity                   1000 non-null   int64 
 12  Smoking                   1000 non-null   int64 
 13  Passive Smoker            1000 non-null   int64 
 14  Chest Pain               

Seems like `index` and `Patient Id` are unique for every row and will thus deivite the accuracy of our model. So lets remove them 

In [5]:
data.drop(["Patient Id" , "index"], axis = 1 , inplace = True)

And now if we see at our datasets

In [6]:
data

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,Low
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,High
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,High
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,44,1,6,7,7,7,7,6,7,7,...,5,3,2,7,8,2,4,5,3,High
996,37,2,6,8,7,7,7,6,7,7,...,9,6,5,7,2,4,3,1,4,High
997,25,2,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,High
998,18,2,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,High


Categorical data is harder for the system to compute and thus making it numerical will be benifical. Lets replace the values of our target with numbers

In [7]:
data.replace(to_replace = "Low" , value = 0 , inplace = True)
data.replace(to_replace = "Medium" , value = 1 , inplace = True)
data.replace(to_replace = "High" , value = 2 , inplace = True)

In [8]:
data

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,0
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,1
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,2
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,2
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,44,1,6,7,7,7,7,6,7,7,...,5,3,2,7,8,2,4,5,3,2
996,37,2,6,8,7,7,7,6,7,7,...,9,6,5,7,2,4,3,1,4,2
997,25,2,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,2
998,18,2,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,2


Lets devide our dataset for easy usage in future

In [9]:
a = data.drop("Level" , axis = 1)
b = data["Level"]

Function transformer is a transformer that transformes the skewed data into normal distribution greatly increasing the accuracy of gaussian distribution

In [10]:
right_skew = []
left_skew = []
for i in data.columns:
    if data[i].skew() > 0:
        right_skew.append(i)
    else:
        left_skew.append(i)

In [11]:
right_trf = FunctionTransformer(func = np.square)
left_trf = FunctionTransformer(func = np.log1p)
right_trfd = right_trf.fit_transform(data[right_skew])
left_trfd = left_trf.fit_transform(data[left_skew])

data_proc = pd.concat([right_trfd , left_trfd , b] , axis = 1 , join = "inner")

In [12]:
data_proc

Unnamed: 0,Age,Gender,Air Pollution,Obesity,Smoking,Passive Smoker,Chest Pain,Coughing of Blood,Fatigue,Weight Loss,...,Dry Cough,Snoring,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Level,Level.1
0,1089,1,4,16,9,4,4,16,9,16,...,9,16,1.609438,1.791759,1.609438,1.386294,1.098612,1.098612,0.000000,0
1,289,1,9,4,4,16,4,9,1,9,...,49,4,0.693147,1.791759,1.386294,1.609438,1.098612,1.098612,0.693147,1
2,1225,1,16,49,4,9,16,64,64,49,...,49,4,1.791759,1.945910,1.791759,1.791759,1.609438,1.945910,1.098612,2
3,1369,1,49,49,49,49,49,64,16,4,...,49,25,2.079442,2.079442,2.079442,1.945910,2.079442,2.079442,1.098612,2
4,2116,1,36,49,64,49,49,81,9,4,...,4,9,2.197225,2.079442,2.079442,2.079442,1.945910,2.079442,1.098612,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1936,1,36,49,49,64,49,49,25,9,...,25,9,2.079442,2.079442,2.079442,2.079442,1.945910,2.079442,1.098612,2
996,1369,4,36,49,49,64,49,49,81,36,...,1,16,2.197225,2.079442,2.079442,2.079442,1.945910,2.079442,1.098612,2
997,625,4,16,49,4,9,16,64,64,49,...,49,4,1.791759,1.945910,1.791759,1.791759,1.609438,1.945910,1.098612,2
998,324,4,36,49,64,49,49,81,9,4,...,4,9,2.197225,2.079442,2.079442,2.079442,1.945910,2.079442,1.098612,2


Now lets divide our datasets into train and test data 

In [13]:
train , test = np.split(data.sample(frac = 1) , [int(0.8 * len(data))])

In [14]:
def pre(dataframe):
    target = ["Level"]
    x = dataframe.drop(target , axis = 1)
    y = dataframe[target]
    
    return x , y

In [15]:
X_train , Y_train = pre(train)
X_test , Y_test = pre(test)

Now as our model is good to go, lets train it 

Our first model will be [KNeighborsRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html)

In [16]:
model_0 = KNeighborsRegressor()
model_0.fit(X_train , Y_train)

KNeighborsRegressor()

Second will be [LinearRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)

In [17]:
model_1 = LinearRegression()
model_1.fit(X_train , Y_train)

LinearRegression()

Third will be [LogisitcRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [18]:
model_2 = LogisticRegression()
model_2.fit(X_train , Y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

Fourth will be [RidgeRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html)

In [19]:
model_3 = Ridge()
model_3.fit(X_train , Y_train)

Ridge()

Fifth will be [Lasso](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)

In [20]:
model_4 = Lasso()
model_4.fit(X_train , Y_train)

Lasso()

Sixth will be [SupportVectorMachine](https://scikit-learn.org/stable/modules/svm.html)

In [21]:
model_5 = SVC()
model_5.fit(X_train , Y_train)

  y = column_or_1d(y, warn=True)


SVC()

Seventh will be [GaussianNaiveBayes](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)

In [22]:
model_6 = GaussianNB()
model_6.fit(X_train , Y_train)

  y = column_or_1d(y, warn=True)


GaussianNB()

And the last will be [RandomForestClassfier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

In [23]:
model_7 = RandomForestClassifier()
model_7.fit(X_train , Y_train)

  


RandomForestClassifier()

In [24]:
print("R2 score for " , model_0 , " is : " , r2_score(Y_test , model_0.predict(X_test)))
print("R2 score for " , model_1 , " is : " , r2_score(Y_test , model_1.predict(X_test)))
print("R2 score for " , model_2 , " is : " , r2_score(Y_test , model_2.predict(X_test)))
print("R2 score for " , model_3 , " is : " , r2_score(Y_test , model_3.predict(X_test)))
print("R2 score for " , model_4 , " is : " , r2_score(Y_test , model_4.predict(X_test)))
print("R2 score for " , model_5 , " is : " , r2_score(Y_test , model_5.predict(X_test)))
print("R2 score for " , model_6 , " is : " , r2_score(Y_test , model_6.predict(X_test)))
print("R2 score for " , model_7 , " is : " , r2_score(Y_test , model_7.predict(X_test)))

R2 score for  KNeighborsRegressor()  is :  1.0
R2 score for  LinearRegression()  is :  0.933968922044628
R2 score for  LogisticRegression()  is :  0.9302973977695167
R2 score for  Ridge()  is :  0.9339556458305182
R2 score for  Lasso()  is :  0.3877908195015577
R2 score for  SVC()  is :  0.9535315985130112
R2 score for  GaussianNB()  is :  0.798636926889715
R2 score for  RandomForestClassifier()  is :  1.0


As we can see out of the eight different models, two models really performed so well that they gave and accuruacy of $100$% which is actually suspecious and phenomenon at the same time. Still the gold medal of this notebook goes to `Logistic Regression` , `Random Forest Classifer` and `KNeighborasClassifier`. The lineup is backed by `Suport Vector Machines` with an accuracy of almost $97.64$%. The silver medal goes to this model for sure. The third place is  a tie between `RidgeRegression` and `Linear Regression` giving an accuracy of almost $95.37$% and also getting a bronze model. In the line up we also have some other algos like `Linear Regression` , `Ridge Regression` which gave an accuracy of $92.10$% and `Gaussian Naive Baiyes` with $79.61$. The worst performance was gave by `Lasso Regression` with an accuracy of $41.13$%. In the upcoming versions we will try to perfrom some preporcessing methods to make these predictions better. Till then Bye Bye

**THANKS FOR VISITING THE NOTEBOOK**

**PLEASE MAKE AN UPVOTE IF YOUR LIKED MY WORK**

**PEACE OUT**

# Version Info - 

* **Version 1 - Raw Code**
* **Version 2 - Documentation**
* **Version 3 - Transformers Addition**