# In [None]:
"""SailGP tack analysis project.

Notebook which imports the dataset, performs some EDA, generates some
features, builds some candidate models, evaluates model accuracy, and
interprets the model results.
"""

# 1. Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import shap

# 2. Load the Dataset
# Read the raw CSV into the DataFrame used by every later section.
# NOTE(review): 'your_dataset.csv' looks like a template placeholder —
# confirm the real path before running.
df = pd.read_csv(filepath_or_buffer='your_dataset.csv')

# 3. Exploratory Data Analysis (EDA)

# 3.1 Basic Information
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() only adds a stray "None".
df.info()
print(df.describe())

# 3.2 Missing Values
# Per-column count of null entries.
missing_values = df.isnull().sum()
print(missing_values)

# 3.3 Target Variable Distribution
# Bar chart of how many rows fall into each value of the 'target' column.
sns.countplot(x='target', data=df)
plt.title('Target Variable Distribution')
plt.show()

# 3.4 Feature Distributions
# One histogram per column that DataFrame.hist can plot.
df.hist(bins=50, figsize=(20, 15))
plt.show()

# 3.5 Correlation Matrix
# Correlate numeric/bool columns only: at this point the categorical columns
# are still raw strings, and DataFrame.corr() raises on them in pandas 2.x
# instead of silently dropping them as older releases did.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# 4. Feature Engineering
# NOTE(review): every step in this section runs on the full DataFrame, before
# the train/test split in section 5, so the imputation mean and the scaler
# statistics are computed with the held-out rows included. Fit them on the
# training rows only if an unbiased test estimate matters.

# 4.1 Handling Missing Values
# Fill missing values or drop missing data.
# NOTE(review): 'column' looks like a template placeholder — it must name a
# real numeric column of the dataset, otherwise this line raises KeyError.
df['column'] = df['column'].fillna(df['column'].mean())

# 4.2 Encoding Categorical Variables
# Replace every object-dtype column with its integer category codes
# (pandas uses the code -1 for missing values).
categorical_features = df.select_dtypes(include=['object']).columns
for col in categorical_features:
    df[col] = df[col].astype('category').cat.codes

# 4.3 Feature Scaling
# Standardise the int64/float64 columns but leave the label alone: scaling
# 'target' would turn integer class labels into continuous floats, which
# RandomForestClassifier refuses to fit as a classification target.
scaler = StandardScaler()
numeric_features = (
    df.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop('target', errors='ignore')
)
# Skip scaling when nothing is left to scale; StandardScaler cannot be fitted
# on a frame with zero columns.
if len(numeric_features) > 0:
    df[numeric_features] = scaler.fit_transform(df[numeric_features])

# 5. Model Building

# 5.1 Split the Data
# Separate the label column from the predictors, then hold out 20% of the
# rows for evaluation (seeded so the split is reproducible).
y = df['target']
X = df.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 5.2 Define the Model / 5.3 Train the Model
# A 100-tree random forest with a fixed seed; fit() returns the fitted
# estimator itself, so definition and training are chained.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# 6. Model Evaluation

# 6.1 Predictions
# Predicted class labels for the held-out rows.
y_pred = model.predict(X_test)

# 6.2 Classification Report
# Text report of the per-class metrics for the test predictions.
print(classification_report(y_test, y_pred))

# 6.3 Confusion Matrix
# Heatmap of actual (rows) versus predicted (columns) counts; the title and
# axis labels are set on the Axes object that heatmap() returns.
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
).set(
    title='Confusion Matrix',
    xlabel='Predicted',
    ylabel='Actual',
)
plt.show()

# 7. Model Interpretation

# 7.1 Feature Importance
# Rank the forest's feature importances from largest to smallest and draw
# them as a bar chart labelled with the matching column names.
importances = model.feature_importances_
indices = np.flip(np.argsort(importances))

n_features = X_train.shape[1]
ranked_names = X_train.columns[indices]
ranked_scores = importances[indices]

plt.figure(figsize=(12, 6))
plt.title('Feature Importances')
plt.bar(range(n_features), ranked_scores, align='center')
plt.xticks(range(n_features), ranked_names, rotation=90)
plt.show()

# 7.2 SHAP Values
# Per-sample, per-feature attributions for the fitted forest on the test set.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Older SHAP releases return one (n_samples, n_features) array per class in a
# list; newer ones return a single (n_samples, n_features, n_classes) array.
# Normalise to the list form so the class indexing below keeps meaning
# "class 1" rather than "sample 1" and the plots get a shape they understand.
if not isinstance(shap_values, list) and np.ndim(shap_values) == 3:
    shap_values = [
        shap_values[:, :, class_idx]
        for class_idx in range(shap_values.shape[2])
    ]

# 7.3 SHAP Summary Plot
# Overview of feature attributions across all classes.
shap.summary_plot(shap_values, X_test)

# 7.4 SHAP Dependence Plot
# Attribution of one feature against its value, for the class at index 1.
# NOTE(review): 'feature_name' looks like a template placeholder — replace it
# with a real column of X_test.
shap.dependence_plot('feature_name', shap_values[1], X_test)

# 8. Conclusion and Next Steps
# Summarize findings, potential improvements, and next steps for further analysis
