In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
# read_csv
df = pd.read_csv("dataset/odors intnsity.csv")

# show the first ten roows
df.head(10)

Unnamed: 0,odor,sens,intensity
0,ajayeb,x32,64522
1,ajmal,x8,40510
2,amreaj,x31,58033
3,aood,x26,72103
4,ajayeb,x1,64558
5,amreaj,x4,58041
6,ajmal,x10,60487
7,ajayeb,x2,64556
8,amreaj,x9,38900
9,ajmal,x15,60528


In [3]:
# Remove missing values from dataset

# print number missing value before handling in each column
print("the sum of null values in dataset befor handling missing values:")
print(df.isnull().sum()) 

# drop missing values from dataset
df = df.dropna(axis=0) # remove rows that contain missing value
print("*" * 50)

# print number missing value after handling in each column
print("the sum of null values in dataset after handling missing values:")
print(df.isnull().sum())


the sum of null values in dataset befor handling missing values:
odor         0
sens         0
intensity    0
dtype: int64
**************************************************
the sum of null values in dataset after handling missing values:
odor         0
sens         0
intensity    0
dtype: int64


In [4]:
# remove duplicated rows

# remove duplicated rows fom dataset
df_duplicates_removed = df.drop_duplicates()

print("the shape of dataset befor remove duplicated rows is: {}".format(df.shape))
print("the shape of dataset after remove duplicated rows is: {}".format(df_duplicates_removed.shape))
print("number of rows that is removed is: {}".format(df.shape[0] - df_duplicates_removed.shape[0]))

the shape of dataset befor remove duplicated rows is: (160, 3)
the shape of dataset after remove duplicated rows is: (160, 3)
number of rows that is removed is: 0


In [5]:
# show correlated attributes 
correlated_features1 = set()
correlated_features2 = set()


# the matrix that represent correlation
correlation_matrix = df.corr()

# show correlation matrix
print("array of correlation between features is:")
#print(correlation_matrix)

# put features correlated bigger than 0.8 and less than -0.8 in two sets
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if correlation_matrix.iloc[i, j] > 0.8:
            colname = correlation_matrix.columns[i]
            correlated_features1.add(colname)
        elif correlation_matrix.iloc[i, j] < -0.8:
            colname = correlation_matrix.columns[i]
            correlated_features2.add(colname)
            
# combination two sets in one set            
correlated_features1.update(correlated_features2)

# show features that correlated greater than 0.8 and less than -0.8
print("the correlated features that greater than 0.8 and less than -0.8 is: {}".format(correlated_features1))

array of correlation between features is:
the correlated features that greater than 0.8 and less than -0.8 is: set()


In [6]:
# convert categorical features to numerical
df["intensity"] = pd.to_numeric(df["intensity"])

# Creating a instance of label Encoder.
le = LabelEncoder()

# creating list that contain catefories features
categories = ["odor", "sens"]


# apply LabelEncoder on all categorical features
for item in categories:
    label = le.fit_transform(df[item])
    df[item] = label




# show data after that
print("this data after converting:")
df.head(10)


this data after converting:


Unnamed: 0,odor,sens,intensity
0,0,25,64522
1,1,38,40510
2,2,24,58033
3,3,18,72103
4,0,0,64558
5,2,33,58041
6,1,1,60487
7,0,11,64556
8,2,39,38900
9,1,6,60528


In [7]:
# show types of features
df.dtypes

odor         int32
sens         int32
intensity    int64
dtype: object

In [8]:
# splitting dataset to training data and testing data

# target column
target_col = "odor"

# X_train dataset
X_train = df.drop(columns=target_col)

# y_train dataset contains one feature ("price" only)
y_train = df.loc[:, target_col]

# split dataset with 70% training data and 30% testing data
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.3, random_state = 0) 

# print the shape for each dataset 
print(f"shape of X_train: {X_train.shape}")
print(f"shape of y_train: {y_train.shape}")
print(f"shape of X_test: {X_test.shape}")
print(f"shape of y_train: {y_test.shape}")

shape of X_train: (112, 2)
shape of y_train: (112,)
shape of X_test: (48, 2)
shape of y_train: (48,)


In [9]:
# K-Neighbors Algorithm
# Application of kneighbors algorithm in classification

neigh=KNeighborsClassifier()
neigh.fit(X_train,y_train)

# Calculate the expected value for y_pred
y_pred_neigh = neigh.predict(X_test)

# Calculate the accuracy of the application of the kneighbors algorithm
Accuracy = accuracy_score(y_pred_neigh,y_test)
print("Accuracy: {}".format(Accuracy * 100))

# calculate precision for kneighbors algorithm
# precision is: tp / (tp + fp)
precision = precision_score(y_test, y_pred_neigh, labels=[1,2], average='micro')
print('Precision: {}'.format(precision * 100))

# calculate recall for kneighbors algorithm
# recall is: tp / (tp + fn)
recall = recall_score(y_test, y_pred_neigh, average='micro')
print('Recall: {}'.format(recall * 100))

# calculate f1_score for kneighbors algorithm
# f1_score is: 2*tp / (2*tp + fp + fn)
score = f1_score(y_test, y_pred_neigh, average='micro')
print('F-Measure: {}'.format(score * 100))

Accuracy: 97.91666666666666
Precision: 100.0
Recall: 97.91666666666666
F-Measure: 97.91666666666666


In [10]:
#Application of DecisionTree algorithm in classification
tree=DecisionTreeClassifier()
tree.fit(X_train,y_train)

#Calculate the expected value for y_pred
y_pred_tree = tree.predict(X_test)

#Calculate the accuracy of the application of the DecisionTree algorithm
Accuracy = accuracy_score(y_pred_tree,y_test)
print("Accuracy: {}".format(Accuracy * 100))

# calculate precision for DecisionTree algorithm
# precision is: tp / (tp + fp)
precision = precision_score(y_test, y_pred_tree, labels=[1,2], average='micro')
print('Precision: {}'.format(precision * 100))

# calculate recall for DecisionTree algorithm
# recall is: tp / (tp + fn)
recall = recall_score(y_test, y_pred_tree, average='micro')
print('Recall: {}'.format(recall * 100))

# calculate f1_score for DecisionTree algorithm
# f1_score is: 2*tp / (2*tp + fp + fn)
score = f1_score(y_test, y_pred_tree, average='micro')
print('F-Measure: {}'.format(score * 100))

Accuracy: 95.83333333333334
Precision: 95.83333333333334
Recall: 95.83333333333334
F-Measure: 95.83333333333334
