In [None]:
# Mount Google Drive into the runtime so the dataset CSV stored there can be read.
# NOTE(review): `google.colab` is presumably only importable on Colab - confirm before
# running this cell in another environment.
from google.colab import drive
drive.mount('/content/drive')

# LUNG CANCER CLASSIFICATION USING SUPPORT VECTOR MACHINE

## ***Note: This notebook was written on Google Colab, so the dataset path points to Google Drive. If you run it on Kaggle, you must change the path to be able to run this project.***

## IMPORT LIBRARY

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 1. READ DATA FROM CSV FILE

In [None]:
# Location of the dataset CSV under the mounted Google Drive folder.
# This path is environment-specific; change it when running outside Colab.
pathData = '/content/drive/MyDrive/AI_project/Lung_cancer_classification_using_SVM/Dataset/lung_cancer.csv'

In [None]:
# Load the CSV located at `pathData` into a pandas DataFrame.
df = pd.read_csv(pathData)

In [None]:
# Show the loaded DataFrame as the cell output for a first visual check.
df

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Remove the 'Name' and 'Surname' columns; they are not used as model features below.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it does not raise KeyError once the columns are gone.
df = df.drop(columns=['Name', 'Surname'], errors='ignore')

In [None]:
# Check the column information again: 'Name' and 'Surname' should no longer be listed
df.info()

## 2. VISUALIZING THE DATA

In [None]:
# Count how many rows fall into each value of the "Result" column
result_counts = df['Result'].value_counts()

# Render the counts as a bar chart through the explicit Figure/Axes interface
fig, ax = plt.subplots(figsize=(8, 6))
result_counts.plot.bar(ax=ax, color='skyblue', rot=0)  # rot=0 keeps the tick labels horizontal
ax.set_xlabel('Result')
ax.set_ylabel('Count')
ax.set_title('Distribution of Result')
plt.show()

### Draw a pair plot to analyze the data distribution, as shown below

In [None]:
# Switch seaborn to its "ticks" theme for the plots that follow.
sns.set_theme(style="ticks")

# Plot the pairwise relationships between the columns of `df`, colored by "Result".
sns.pairplot(df, hue="Result")

In [None]:

# Target vector: the 'Result' label column
Y_train = df['Result']

# Feature matrix: the four predictor columns Age, Smokes, AreaQ and Alkhol.
# At this point both variables still hold every row of the dataset; the actual
# train/test split happens in a later cell.
feature_columns = ['Age', 'Smokes', 'AreaQ', 'Alkhol']
X_train = df[feature_columns]


In [None]:
# Show both shapes as one tuple: a cell only displays its last expression, so
# evaluating the two `.shape` attributes on separate lines would hide the first one.
X_train.shape, Y_train.shape

## 3. DATA FOR TRAINING AND TESTING

### Split the dataset into a training set (80%) and a test set (20%), with random_state = 42

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set; random_state is fixed for reproducibility.
# The inputs are selected from `df` rather than taken from X_train/Y_train: those names
# are overwritten by this very assignment, so feeding them back in would split an
# already-reduced training set again every time the cell is re-run.
X_train, X_test, Y_train, Y_test = train_test_split(
    df[['Age', 'Smokes', 'AreaQ', 'Alkhol']],
    df['Result'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the shape of each of the four arrays produced by the train/test split.
split_shapes = [
    ('X train shape: ', X_train.shape),
    ('Y train shape: ', Y_train.shape),
    ('X test shape: ', X_test.shape),
    ('Y test shape: ', Y_test.shape),
]
for caption, shape in split_shapes:
    print(caption, shape)

## 4. Build model

In [None]:
from sklearn.svm import SVC

In [None]:
# SVMs are not scale invariant, and the feature columns are on very different scales,
# so the inputs are standardized before they reach the classifier. Putting both
# steps in one pipeline means the scaler is fitted only on the data passed to fit() and
# is applied automatically by predict() and predict_proba().
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

model = make_pipeline(
    StandardScaler(),
    # probability=True enables predict_proba(); random_state seeds the data shuffling
    # scikit-learn performs when computing those probability estimates.
    SVC(C=10, probability=True, random_state=9),
)

In [None]:
# Train the model on the training split. fit() returns the fitted estimator itself,
# so rebinding `model` keeps it pointing at the same object.
model = model.fit(X_train, Y_train)

In [None]:
# Predicted class label for every row of the test split (shown as the cell output).
model.predict(X_test)

In [None]:
# Estimated class probabilities for every test row: one column per class,
# in the order given by model.classes_.
model.predict_proba(X_test)

## 5. Results and reviews

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:

# Predict labels on the test set
Y_pred = model.predict(X_test)

# Create the confusion matrix with an explicit label order, following the notebook's
# convention that 0 means not sick and 1 means sick. Pinning the labels guarantees a
# 2x2 matrix with a known row/column order even if one class is absent from the
# small test split.
class_labels = [0, 1]
class_names = ['Not Sick', 'Sick']
cm = confusion_matrix(Y_test, Y_pred, labels=class_labels)

# Draw the confusion matrix, labelling rows and columns with the class names
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()



In [None]:
from sklearn.metrics import classification_report

# Classification report
# The label order is pinned so each display name is matched to the intended class, and
# so the report still works when the test split or the predictions contain only one of
# the two classes (without `labels`, the two target names would then not match the
# number of classes found and scikit-learn raises a ValueError).
class_labels = [0, 1]  # 0 means not sick, 1 means sick
target_names = ['Not Sick', 'Sick']
print(classification_report(Y_test, Y_pred, labels=class_labels, target_names=target_names))



In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label
test_accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print("Test Accuracy:", test_accuracy)
