In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
%matplotlib inline

In [49]:
# Read the two CSV inputs from the working directory into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('training.csv', 'test.csv')
)

In [None]:
# Display the training DataFrame as the cell output.
train_df

In [None]:
# Summary statistics of the training DataFrame via pandas `describe`.
train_df.describe()

In [None]:
# Remove pandas' display limits on both columns and rows so that subsequent
# outputs are rendered without truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Count the missing values in each column of the training data.
train_df.isna().sum()

In [None]:
# Frequency of each distinct patient_race value.
train_df['patient_race'].value_counts()

In [None]:
# Label missing patient_race values explicitly as 'not_given'.
# The filled column is assigned back rather than calling fillna(inplace=True)
# on the selected column: that chained in-place form emits a FutureWarning in
# pandas 2.x and leaves train_df unchanged once Copy-on-Write is enabled.
train_df['patient_race'] = train_df['patient_race'].fillna('not_given')

In [None]:
# Frequency of each distinct payer_type value.
train_df['payer_type'].value_counts()

In [None]:
# Replace missing values in these three columns with the explicit label
# 'none_given'. The filled columns are assigned back instead of using
# fillna(inplace=True) on each selected column, which is chained assignment:
# it warns in pandas 2.x and has no effect on train_df under Copy-on-Write.
placeholder_columns = ['payer_type', 'Region', 'Division']
train_df[placeholder_columns] = train_df[placeholder_columns].fillna('none_given')

In [None]:
# Frequency of each distinct patient_state value.
train_df['patient_state'].value_counts()

In [None]:
# Impute missing patient_state values with the most frequent state
# (first entry of the column's mode).
max_state = train_df['patient_state'].mode()[0]
# Assign back instead of fillna(inplace=True) on the selected column; the
# chained in-place form warns in pandas 2.x and is a no-op under Copy-on-Write.
train_df['patient_state'] = train_df['patient_state'].fillna(max_state)

In [None]:
# Impute missing bmi values with the column median.
median_bmi = train_df['bmi'].median()
# Assign back instead of fillna(inplace=True) on the selected column; the
# chained in-place form warns in pandas 2.x and is a no-op under Copy-on-Write.
train_df['bmi'] = train_df['bmi'].fillna(median_bmi)

In [None]:
# Impute every remaining missing numeric value with its column median.
# numeric_only=True is required: since pandas 2.0, DataFrame.median() no longer
# silently skips non-numeric columns and raises TypeError on object columns.
medians = train_df.median(numeric_only=True)
# Only columns present in `medians` (the numeric ones) are filled.
train_df = train_df.fillna(medians)

In [None]:
# Frequency of each distinct metastatic_first_novel_treatment value.
train_df['metastatic_first_novel_treatment'].value_counts()

In [None]:
# Frequency of each distinct metastatic_first_novel_treatment_type value.
train_df['metastatic_first_novel_treatment_type'].value_counts()

In [None]:
# Frequency of each distinct DiagPeriodL90D value.
train_df['DiagPeriodL90D'].value_counts()

In [None]:
# Remove the patient_gender column, in place, from both the training and the
# test frame.
for frame in (train_df, test_df):
    frame.drop(columns=['patient_gender'], inplace=True)

In [None]:
# Bar chart of patient_race frequencies, with each bar labelled by its count.
race = train_df['patient_race'].value_counts()
fig, ax = plt.subplots(figsize=(7, 6))
race.plot(kind='bar', rot=0, color='c', ax=ax)
ax.set_title('Race Distribution')
ax.set_xlabel('Race')
ax.set_ylabel('Count')
for rect in ax.patches:
    y_value = rect.get_height()
    # Anchor the label at the horizontal centre of the bar, just above its top;
    # anchoring at get_x() + get_width() would start the text at the bar's
    # right edge, so it would sit beside the bar rather than over it.
    x_value = rect.get_x() + rect.get_width() / 2
    # Bar heights are counts, so render them without a fractional part.
    ax.annotate(f'{y_value:.0f}', (x_value, y_value), ha='center', va='bottom')

plt.show()

In [None]:
# Grouped count plot: patient_race on the x-axis, bars split by DiagPeriodL90D.
count_fig, count_ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train_df, x='patient_race', hue='DiagPeriodL90D', ax=count_ax)
count_ax.set_title('Relationship between Patient Race and Diagnosis')
plt.show()

In [None]:
# Box plot of bmi for each DiagPeriodL90D value.
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train_df, x='DiagPeriodL90D', y='bmi', palette='viridis', ax=box_ax)
box_ax.set(
    title='Relationship between BMI and Diagnosis',
    xlabel='Diagnosis',
    ylabel='BMI',
)
plt.show()

In [None]:
# Show the dtype of every column in the training DataFrame.
train_df.dtypes

In [None]:
# Frequency of each distinct metastatic_first_novel_treatment_type value
# (the same expression is evaluated in an earlier cell of this notebook).
train_df['metastatic_first_novel_treatment_type'].value_counts()

In [None]:
# Load spaCy's 'en_core_web_lg' pipeline; the cells below use it to turn the
# diagnosis description text into document vectors.
# `spacy` itself is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
nlp = spacy.load('en_core_web_lg')

In [None]:
# Replace each diagnosis description in train_df with its spaCy document vector.
# The pipeline is run once per distinct description and the result is looked up
# for every row, instead of re-parsing the same text on each row. Rows with the
# same description therefore reference the same vector array object.
train_desc_vectors = {
    desc: nlp(desc).vector
    for desc in train_df['breast_cancer_diagnosis_desc'].unique()
}
train_df['breast_cancer_diagnosis_desc'] = train_df['breast_cancer_diagnosis_desc'].map(
    train_desc_vectors.__getitem__
)

In [None]:
# Replace each diagnosis description in test_df with its spaCy document vector.
# The pipeline is run once per distinct description and the result is looked up
# for every row, instead of re-parsing the same text on each row. Rows with the
# same description therefore reference the same vector array object.
test_desc_vectors = {
    desc: nlp(desc).vector
    for desc in test_df['breast_cancer_diagnosis_desc'].unique()
}
test_df['breast_cancer_diagnosis_desc'] = test_df['breast_cancer_diagnosis_desc'].map(
    test_desc_vectors.__getitem__
)

In [None]:
# Columns removed, in place, from both the training and the test frame:
# the two diagnosis-code columns and the two first-novel-treatment columns.
dropped_columns = [
    'breast_cancer_diagnosis_code',
    'metastatic_cancer_diagnosis_code',
    'metastatic_first_novel_treatment',
    'metastatic_first_novel_treatment_type',
]
for frame in (train_df, test_df):
    frame.drop(columns=dropped_columns, inplace=True)

In [None]:
# NOTE(review): 'par' is not referenced anywhere else in this notebook and looks
# like a truncated column name. This lookup raises KeyError unless train_df has
# a column literally named 'par' — confirm the intended column or remove the cell.
train_df['par']

In [None]:
# One-hot encode 'patient_race': indicator columns prefixed 'patient_race_' are
# appended and the original column is removed in the same call.
# The encoded frame is built from train_df because no `df` is defined in the
# cells above; the self-referential `df = pd.concat([df, ...])` form raises
# NameError on a fresh kernel and KeyError when the cell is run a second time.
df = pd.get_dummies(train_df, columns=['patient_race'], prefix='patient_race')

# Display the updated DataFrame
print(df