<a href="https://colab.research.google.com/github/DMKkalle/MachineLearning/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Additional information:
    
    https://scikit-learn.org/stable/modules/clustering.html#k-means

In [None]:
from pathlib import Path

# The star import stays ahead of the explicit third-party imports so that the
# explicit names below take precedence over anything it exports.
from fastai.tabular.all import *

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn import tree
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [None]:
# Locate the data directory and list its contents to confirm the CSV is there.
# '/content/' is the Colab working directory; fall back to the current working
# directory so the notebook also runs outside Colab instead of crashing here.
path = Path('/content/')
if not path.is_dir():
    path = Path.cwd()

# Single listing, left as the last expression so the notebook renders it.
sorted(path.iterdir())

In [None]:
# Load the Titanic CSV into pandas, print the column dtypes / non-null counts,
# then show the first rows.
df = pd.read_csv(path/'titanic.csv')
df.info()
# head() must be the last expression of the cell; otherwise its result is
# silently discarded and the preview never appears.
df.head()

In [None]:
# Prepare the columns for use with TabularPandas.

# Coerce the numeric columns to float (unparseable entries become NaN) and
# fill the gaps with the column median.
for numeric_col in ('age', 'fare'):
    as_float = pd.to_numeric(df[numeric_col], errors='coerce')
    df[numeric_col] = as_float.fillna(as_float.median())

# Store 'sex' and 'embarked' as pandas categoricals.
for label_col in ('sex', 'embarked'):
    df[label_col] = df[label_col].astype('category')

cat_names = ['pclass', 'sex', 'embarked', 'sibsp', 'parch']  # categorical features
cont_names = ['age', 'fare']  # continuous features
dep_var = 'survived'  # dependent variable, i.e. the prediction target






In [None]:
# Children are defined as passengers younger than 18; everyone else
# (age >= 18) is treated as an adult.
# NOTE(review): missing ages were filled with the median in the preprocessing
# cell, so passengers of unknown age land on whichever side of 18 the median
# falls (presumably "adult" - verify against the data).
df['is_child'] = df['age'] < 18

# Mean of the 0/1 target per group = survival rate per group.
child_survival = df.groupby('is_child')['survived'].mean()
print("Survival rate for children vs adults:")
print(child_survival)

# How many children and adults survived / died.
child_count = df.groupby(['is_child', 'survived'])['survived'].count()
print("Count of children and adults:")
print(child_count)

plt.figure(figsize=(8, 6))
sns.barplot(x='is_child', y='survived', data=df, errorbar=None, palette='viridis')
plt.title('Survival Rate: Children vs Adults')
# The previous label ('age > 18 vs age < 18') excluded age == 18, which the
# code above counts as adult; the label now matches the actual split.
plt.xlabel('Adult (age >= 18) vs Child (age < 18)')
plt.ylabel('Survival Rate')
# is_child == False is drawn at position 0, True at position 1.
plt.xticks([0, 1], ['Adult', 'Child'])
plt.show()

In [None]:


# Bar plot of the survival rate for each value of 'parch'.
# 'parch' counts parents/children aboard only (siblings/spouses are in
# 'sibsp'), so the labels name that quantity instead of "family members";
# total family size would be sibsp + parch.
plt.figure(figsize=(10, 6))
sns.barplot(x='parch', y='survived', data=df, errorbar=None, palette='viridis')
plt.title('Survival vs. Parents/Children on Board')
plt.xlabel('Number of Parents/Children Aboard (parch)')
plt.ylabel('Survival Rate')
plt.show()

In [None]:
# Cluster passengers on age and ticket price with K-Means (3 clusters).
# KMeans is imported in the notebook's import cell.
X_kmeans = df[['age', 'fare']].dropna()

kmeans = KMeans(n_clusters=3, random_state=421337)
cluster_labels = kmeans.fit_predict(X_kmeans)

# Attach the labels through X_kmeans' index. A plain positional assignment
# (df['cluster'] = <array>) raises a length-mismatch error as soon as dropna()
# removes a row; index alignment leaves such rows as NaN instead.
df['cluster'] = pd.Series(cluster_labels, index=X_kmeans.index)

# Visualise the clusters.
plt.figure(figsize=(8, 6))
sns.scatterplot(x='age', y='fare', hue='cluster', data=df, palette='viridis')
plt.title('K-Means Clustering of Titanic Data')
plt.xlabel('Age')
plt.ylabel('Fare')
plt.show()

In [None]:
# Relationship between cluster membership and survival.
# df['cluster'] is already populated by the clustering cell, so it is not
# reassigned from kmeans.labels_ here: that positional assignment was redundant
# and would fail if the clustering input had fewer rows than df.

# Passenger counts per (cluster, survived) combination.
print(df.groupby(['cluster', 'survived'])['survived'].count())

# Visualise the relationship.
sns.countplot(x='cluster', hue='survived', data=df)
plt.title('Cluster vs. Survival')
plt.xlabel('Cluster')
plt.ylabel('Count')
# Legend entries follow the hue order; assumes survived is coded 0 = died, 1 = survived.
plt.legend(['Died', 'Survived'])
plt.show()


In [None]:
# Configure TabularPandas.

# Preprocessing steps applied by fastai.
procs = [Categorify, FillMissing, Normalize]

# Split the row indices into training and validation parts (80/20, seeded).
splits = RandomSplitter(valid_pct=0.2, seed=421337)(range_of(df))

# Create the TabularPandas object.
# y_block is set explicitly: 'survived' is numeric, and for a numeric target
# fastai would otherwise infer a regression problem, which breaks the accuracy
# metric, ClassificationInterpretation and to.vocab used further down.
to = TabularPandas(
    df,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=dep_var,
    y_block=CategoryBlock(),
    splits=splits,
)
to.show(5)

In [None]:
# Build the DataLoaders and train a tabular neural network on them.
dls = to.dataloaders()
learn = tabular_learner(dls, metrics=accuracy)

# One-cycle schedule: 10 epochs with a peak learning rate of 1e-3.
learn.fit_one_cycle(n_epoch=10, lr_max=1e-3)

# Show a sample of predictions next to the targets.
learn.show_results()





In [None]:
# Three quick looks at survival: by sex (counts), by passenger class (rate)
# and by age (distribution). Each entry is (plot function, column mapping, title).
survival_plots = [
    (sns.countplot, {'x': 'sex', 'hue': 'survived'}, 'Överlevnad per kön'),
    (sns.barplot, {'x': 'pclass', 'y': 'survived'}, 'Överlevnad per Reseklass'),
    (sns.boxplot, {'x': 'survived', 'y': 'age'}, 'Överlevnad per Ålder'),
]

for draw, column_mapping, heading in survival_plots:
    draw(data=df, **column_mapping)
    plt.title(heading)
    plt.show()

In [None]:
# Evaluate the learner on the validation set; entry 0 is the loss and entry 1
# the metric passed to tabular_learner (accuracy).
results = learn.validate()
val_loss, val_accuracy = results[0], results[1]
print(f"Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

Identify categorical and continuous data. The last column is normally the class we want to predict.

In [None]:
#skapa och träna en Decision Tree-modell
dt_model = DecisionTreeClassifier(max_depth=3, random_state=421337)
dt_model.fit(X_train, y_train)

#visualisera beslutsträdet
plt.figure(figsize=(12, 8))
plot_tree(dt_model, feature_names=X.columns, class_names=['Died', 'Survived'], filled=True)
plt.title("Beslutsträd för Titanic-Data")
plt.show()


In [None]:
#SKAPA OCH TRÖNA EN RANDOM FOREST-MODELL

rf_model = RandomForestClassifier(n_estimators=100, random_state=421337)
rf_model.fit(X_train, y_train)

#Gör förutsägelser
y_pred_rf = rf_model.predict(X_test)

#Utvärdera RAndom Forest
print("Classification Report for Random Forest:")
print(classification_report(y_test, y_pred_rf))

#SKAPA OCH VISA CONFUSION MATRIX

cm_rf = confusion_matrix(y_test, y_pred_rf)

plt.figure(figsize=(8, 6))
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Greens', xticklabels=['Died', 'Survived'], yticklabels=['Died', 'Survived'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Random Forest')
plt.show()




In [None]:
### Förbereda dataset för scikit-learn
X = df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']]
y = df['survived']

#Omvandla kategoriska variabler till dummy-variabler för scikit-learn
X = pd.get_dummies(X, columns=['sex', 'embarked'], drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=421337)


In [None]:
#UTVÄRDERA MODELLEN KLASSIFICERINGSRAPPORT OCH CONFUSIONMATRIX

#Förutsägelser på testdatan
y_pred = dt_model.predict(X_test)

#KLAssificeringsrapport
print("Classification Report for Decision Tree:")
print(classification_report(y_test, y_pred))

#Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Died', 'Survived'], yticklabels=['Died', 'Survived'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Decision Tree')
plt.show()

# fastai Neural Network

In [None]:
# Build an interpretation object from the trained learner, then plot its
# confusion matrix and print its classification report.
# NOTE(review): presumably evaluated on the validation set - confirm in the fastai docs.
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix()
interp.print_classification_report()

In [None]:
# Run the trained learner on the first row of df; the three returned values
# are unpacked into row, clas and probs for inspection in the next cells.
row, clas, probs = learn.predict(df.iloc[0])

In [None]:
# Display the row object returned by learn.predict.
row.show()

In [None]:
# Show the remaining two values returned by learn.predict
# (presumably the predicted class and the class probabilities - verify).
clas, probs

In [None]:
# Build an unlabeled mini test set from the first 10 rows and predict on it.
# The target column (dep_var, i.e. 'survived') is removed to mimic unseen data.
# The previous code dropped 'salary', a column this dataset does not have,
# which raised a KeyError.
test_df = df.head(10).drop(columns=[dep_var])
dl = learn.dls.test_dl(test_df)

learn.get_preds(dl=dl)

# Decision Tree Learner

See https://scikit-learn.org/stable/modules/tree.html#tree for a good explanation of decision trees.

In [None]:
# Reuse the fastai-preprocessed training / validation split for scikit-learn.
# NOTE: this rebinds X_train and y_train, replacing the train_test_split
# versions created earlier in the notebook.
X_train = to.train.xs
y_train = to.train.ys.values.ravel()  # flatten the target frame to a 1-D array

X_valid = to.valid.xs
y_valid = to.valid.ys.values.ravel()

In [None]:
# Preview the preprocessed training features; head() avoids dumping the whole frame.
X_train.head()

In [None]:


# Fit a small decision tree (at most 5 leaves) on the fastai-preprocessed data.
clf = DecisionTreeClassifier(max_leaf_nodes=5, random_state=0)
clf.fit(X_train, y_train)

plt.figure(figsize=(12, 16))  # plot size in inches
# feature_names comes from the columns the tree was fitted on, as a plain list.
# class_names must be strings; to.vocab holds the raw target values, which are
# not strings for a 0/1 target. Assumes survived is coded 0 = died, 1 = survived.
tree.plot_tree(
    clf,
    fontsize=12,
    feature_names=list(X_train.columns),
    class_names=['Died', 'Survived'],
    filled=True,
)
plt.show()

In [None]:
# Evaluate the decision tree on the validation split.
# target_names must be strings (classification_report measures their length);
# to.vocab holds the raw target values, which are not strings for a 0/1 target.
# Assumes survived is coded 0 = died, 1 = survived.
print(classification_report(y_valid, clf.predict(X_valid), target_names=['Died', 'Survived']))
ConfusionMatrixDisplay.from_estimator(clf, X_valid, y_valid, display_labels=['Died', 'Survived'])

In [None]:
# Importance of each feature in the fitted decision tree, labelled by column name.
tree_importances = clf.feature_importances_
feature_importance = pd.Series(data=tree_importances, index=X_train.columns)
feature_importance

In [None]:
# Wide figure so the feature names on the x-axis stay readable.
wide_figure = {"figure.figsize": (20, 6)}
sns.set(rc=wide_figure)

# Bar chart of the decision tree's feature importances.
sns.barplot(
    x=X_train.columns,
    y=clf.feature_importances_,
    color='C2',
)

# Random Forest

In [None]:
# Random forest on the fastai-preprocessed data.
# random_state is fixed (same seed as the other models in this notebook) so the
# forest, its report and its feature importances are reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, max_depth=30, random_state=421337)
rf.fit(X_train, y_train)

In [None]:
# Evaluate the random forest on the validation split.
# target_names must be strings (classification_report measures their length);
# to.vocab holds the raw target values, which are not strings for a 0/1 target.
# Assumes survived is coded 0 = died, 1 = survived.
print(classification_report(y_valid, rf.predict(X_valid), target_names=['Died', 'Survived']))
ConfusionMatrixDisplay.from_estimator(rf, X_valid, y_valid, display_labels=['Died', 'Survived'])

In [None]:
# Random-forest feature importances, labelled by column and sorted from most
# to least important.
forest_importances = pd.Series(data=rf.feature_importances_, index=X_train.columns)
feature_importance = forest_importances.sort_values(ascending=False)
feature_importance

In [None]:
# Wide figure (20 x 6 inches) so the feature names stay readable.
wide_figure = {"figure.figsize": (20, 6)}
sns.set(rc=wide_figure)

# Vertical bar chart of the sorted random-forest feature importances.
sns.barplot(
    x=feature_importance.index,
    y=feature_importance,
    orient="v",
    color='C1',
)