# Lab Setup

In [None]:
import pandas as pd
import pygwalker as pyg
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

In [None]:
# Read the two training tables from the local data/ directory
# (casualty first, then vehicle).
casualty, vehicle = (
    pd.read_csv(csv_path)
    for csv_path in ('data/casualty_train.csv', 'data/vehicle_train.csv')
)


## Modify the settings

In [None]:
# Relax pandas' display limits so frames are shown in full: up to 500
# characters per cell, and no cap on the number of rows or columns printed.
pd.set_option(
    'display.max_colwidth', 500,
    'display.max_rows', None,
    'display.max_columns', None,
)

# EDA Checklist 

## INITIAL EXPLORATION - CASUALTY

In [None]:
# Preview the first rows of the casualty table (head() shows 5 rows by default).
casualty.head()

In [None]:
# Preview the last rows of the casualty table (tail() shows 5 rows by default).
casualty.tail()

## INITIAL EXPLORATION - VEHICLE

In [None]:
# Preview the first rows of the vehicle table (head() shows 5 rows by default).
vehicle.head()

In [None]:
# Preview the last rows of the vehicle table (tail() shows 5 rows by default).
vehicle.tail()

## DESCRIBE & SHAPE

In [None]:
# (row count, column count) of the casualty table.
casualty.shape

In [None]:
# Summary statistics for the casualty table; by default describe() reports
# only the numeric columns of a mixed-type frame.
casualty.describe()

In [None]:
# (row count, column count) of the vehicle table.
vehicle.shape

In [None]:
# Summary statistics for the vehicle table's numeric columns.
vehicle.describe()

## MISSING VALUES?

In [None]:
casualty.isnull().sum()

In [None]:
vehicle.isnull().sum()

## DUPLICATES?

In [None]:
# Count vehicle rows that exactly repeat an earlier row in every column
# (the first occurrence of each row is not counted).
vehicle.duplicated().sum()

In [None]:
# Same check for the casualty table.
casualty.duplicated().sum()

## DTYPES

In [None]:
# Data type pandas inferred for each vehicle column when reading the CSV.
vehicle.dtypes

In [None]:
# Data type pandas inferred for each casualty column when reading the CSV.
casualty.dtypes

## VISUALISATION

In [None]:
# Hand the whole casualty table to PyGWalker for visual exploration; the
# object returned by pyg.walk is kept in gwalker_casualty.
gwalker_casualty = pyg.walk(casualty)


# MERGE THE CASUALTY AND VEHICLE TABLES

In [None]:
# Inner-join the casualty and vehicle tables: rows are paired when both the
# accident reference and the vehicle reference match, and rows with no match
# in the other table are left out.
merged_df = casualty.merge(
    vehicle,
    how='inner',
    on=['accident_reference', 'vehicle_reference'],
)

merged_df.head()

In [None]:
# (row count, column count) after the inner join.
merged_df.shape

In [None]:
# Summary statistics for the numeric columns of the merged table.
merged_df.describe()

In [None]:
# df is a second name for merged_df (no copy is made here).
df = merged_df

In [None]:
# Look at a larger sample of the merged rows (first 150).
df.head(150)

# KEEP ONLY THE ROWS WITH NO -1 VALUES

In [None]:
# Discard every row in which at least one column holds the value -1.
df = df[~(df == -1).any(axis=1)]

In [None]:
# Summary statistics after the rows containing -1 were removed.
df.describe()

In [None]:
# (row count, column count) remaining after the -1 filter.
df.shape

In [None]:
# First 100 of the remaining rows.
df.head(100)

# Features (x), Target (y)

In [None]:
# Target: the raw casualty_severity codes, kept as a one-column DataFrame.
# The codes are mapped to binary (1 = fatal/serious, 0 = slight) in a later cell.
y = df[['casualty_severity']]

# Features: every other column. The target is removed so it cannot leak into X.
X = df.drop(columns=['casualty_severity'])

In [None]:
# First 150 target values, still as the original severity codes.
y.head(150)

# Map target to binary 1 (fatal, serious) and 0 (slight).

In [None]:
# Collapse the three severity codes into two classes:
# codes 1 and 2 become 1, code 3 becomes 0.
y = y.replace(to_replace=[1, 2, 3], value=[1, 1, 0])

In [None]:
# First 150 target values after the binary mapping.
y.head(150)

In [None]:
# dtype of the target column.
y.dtypes

In [None]:
# dtypes of the feature columns (the scaler used later needs numeric input).
X.dtypes

In [None]:
# First 100 feature rows.
X.head(100)

In [None]:
# Remove four columns from the feature set before modelling.
X = X.drop(
    columns=[
        'lsoa_of_casualty',
        'lsoa_of_driver',
        'generic_make_model',
        'accident_reference',
    ]
)

In [None]:
# Feature rows after the four columns were dropped.
X.head()

In [None]:
# (row count, feature count) that will go into the train/test split.
X.shape

# SVM 

# Load the test data & merge it

In [None]:
# Read the two unseen-data tables from the local data/ directory
# (casualty first, then vehicle).
casualty_test, vehicle_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/casualty_test.csv', 'data/vehicle_test.csv')
)


In [None]:
# Last rows of the vehicle test table (tail() shows 5 rows by default).
vehicle_test.tail()

In [None]:
# Inner-join the test tables on the same two keys used for the training data.
merged_test = casualty_test.merge(
    vehicle_test,
    how='inner',
    on=['accident_reference', 'vehicle_reference'],
)

In [None]:
# First rows of the merged test table.
merged_test.head()

In [None]:
# (row count, column count) of the merged test table.
merged_test.shape

In [None]:
# df1 is a second name for merged_test (no copy is made here).
df1 = merged_test

In [None]:
df1.shape

# Drop the columns you do not need


In [None]:
# Remove the same four columns that were dropped from the training features.
df1 = df1.drop(
    columns=[
        'lsoa_of_casualty',
        'lsoa_of_driver',
        'generic_make_model',
        'accident_reference',
    ]
)

In [None]:
# Test rows after the four columns were dropped.
df1.head()

In [None]:
# (row count, column count) of the data that will be scored.
df1.shape

In [None]:
# new_data is another name for the same frame as df1 (no copy is made).
new_data = df1

# TRAIN & PREDICT

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on the target keeps the
# class proportions the same in the training and evaluation splits, which
# matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardise the features. The scaler is fitted on the training split only,
# so no statistics from the evaluation rows leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Oversample with SMOTE to counter the class imbalance. Only the training
# split is resampled; the evaluation split keeps its original class balance.
# The one-column target frame is flattened to 1-D, the shape expected for
# labels by the resampler and the classifier.
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(
    X_train, y_train.to_numpy().ravel()
)

# Linear-kernel SVM. `gamma` is not passed because it only applies to the
# 'rbf', 'poly' and 'sigmoid' kernels.
# NOTE(review): SVC fit time grows at least quadratically with the number of
# samples; if training is too slow on this data, LinearSVC is the usual
# alternative.
svm = SVC(kernel='linear', C=1.0)

# Train on the resampled training data.
svm.fit(X_train_resampled, y_train_resampled)

# Predict on the held-out evaluation split.
y_pred = svm.predict(X_test)

# NOTE(review): on imbalanced data accuracy alone can hide poor performance on
# the minority class; consider also inspecting per-class precision/recall.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

In [None]:
# Preprocess the new data. Selecting X.columns puts the features in exactly
# the order the scaler and the model were fitted on, drops any column they
# never saw, and raises a KeyError if a training feature is missing.
# The scaled values get their own name instead of overwriting `new_data`, so
# re-running this cell does not scale data that has already been scaled.
new_data_scaled = scaler.transform(new_data[X.columns])

# Predict the class labels of the new data
predicted_labels = svm.predict(new_data_scaled)

In [None]:
import os

# One row per scored record, in the same order as the rows given to predict().
# NOTE(review): this rebinds df1, so it no longer refers to the test features,
# and the output carries no identifier column to match rows back to the input.
df1 = pd.DataFrame(predicted_labels, columns=['predicted_labels'])

# to_csv does not create missing directories, so make sure 'out/' exists
# before writing; exist_ok avoids an error when it is already there.
os.makedirs('out', exist_ok=True)
df1.to_csv('out/predicts.csv', index=False)

In [None]:
# df1 now holds the predictions, so this shows the last predicted labels.
df1.tail()