## Thyroid Cancer Recurrence Project

Cell 1 - Importing Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score,recall_score,confusion_matrix
import pickle

Cell 2 - Reading and Importing Data

In [None]:
# Load the raw records from the CSV file into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer="dataset.csv")

Cell 3 - Encoding the binary target column ('Recurred') as 1 (Yes) / 0 (No)

In [None]:
# Encode the target labels as integers: 'Yes' -> 1, 'No' -> 0.
# Series.map turns any label that is not a key of the dict into NaN.
dataset["Recurred"] = dataset["Recurred"].map({"No": 0, "Yes": 1})

Cell 4 - Separating the feature matrix (X) from the target vector (y)

In [None]:
# Target vector: the recurrence label.
y = dataset["Recurred"]
# Feature matrix: every column except the label.
X = dataset.drop("Recurred", axis="columns")

Cell 5 - Splitting the data for training and testing

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Cell 6 - Listing the categorical feature names

In [None]:
# Columns handed to the one-hot encoder. The order is kept as-is because it
# determines the order of the encoded output columns. The names are used as
# DataFrame column keys, so they are spelled exactly as written here.
categorical_columns = [
    'Gender',
    'Smoking',
    'Hx Smoking',
    'Hx Radiothreapy',
    'Thyroid Function',
    'Physical Examination',
    'Adenopathy',
    'Pathology',
    'Focality',
    'Risk',
    'T',
    'N',
    'M',
    'Stage',
    'Response',
]

Cell 7 - Pipelining and training

In [None]:
# One-hot encode the listed categorical columns; every other column is passed
# through unchanged. Categories that were not seen during fitting are ignored
# at transform time instead of raising an error.
preprocess = ColumnTransformer(
    transformers=[
        (
            'encode',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            categorical_columns,
        ),
    ],
    remainder='passthrough',
)

# Chain the preprocessing step and the classifier into a single estimator.
pipeline = Pipeline(
    steps=[
        ('encode', preprocess),
        ('logreg', LogisticRegression(max_iter=1000)),
    ]
)

# Fit on the training split (fit returns the pipeline itself), then predict
# labels for the held-out split.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

Cell 8 - Evaluation of the Model

In [None]:
# Report performance on the held-out split, one metric per line.
for label, metric in (
    ("Confusion Matrix:", confusion_matrix),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(label, metric(y_test, y_pred))

Cell 9 - Exporting pipeline

In [None]:
# Serialize the fitted pipeline to disk. The context manager closes the file
# handle even if pickling raises, so the handle is not leaked and the bytes
# are flushed to 'pipeline.pkl'.
with open('pipeline.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)