In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [17]:
df = pd.read_csv("data/train.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
df.drop('Name', axis=1, inplace=True)
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S
6,7,0,1,male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,female,4.0,1,1,PP 9549,16.7,G6,S
11,12,1,1,female,58.0,0,0,113783,26.55,C103,S


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 183 entries, 1 to 889
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  183 non-null    int64  
 1   Survived     183 non-null    int64  
 2   Pclass       183 non-null    int64  
 3   Sex          183 non-null    object 
 4   Age          183 non-null    float64
 5   SibSp        183 non-null    int64  
 6   Parch        183 non-null    int64  
 7   Ticket       183 non-null    object 
 8   Fare         183 non-null    float64
 9   Cabin        183 non-null    object 
 10  Embarked     183 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 17.2+ KB


# Select your features (columns)

In [19]:
# get X and y variables, Assign X (data) and y (target)
X = df
y = df["Survived"]
print(X.shape, y.shape)

(183, 11) (183,)


In [20]:
# Use train_test_split to create training and testing data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [21]:
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
230,231,1,1,female,35.0,1,0,36973,83.475,C83,S
724,725,1,1,male,27.0,1,0,113806,53.1,E8,S
257,258,1,1,female,30.0,0,0,110152,86.5,B77,S
434,435,0,1,male,50.0,1,0,13507,55.9,E44,S
195,196,1,1,female,58.0,0,0,PC 17569,146.5208,B80,C


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [22]:
# Scale your data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1)

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)


print(encoded_y_test)

ValueError: could not convert string to float: 'female'

# Train the Model



In [None]:
# Create the SVC Model
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)

In [None]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
# Create the GridSearchCV model
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [None]:
# Train the model with GridSearch
# This will take the SVC model and try each combination of parameters
grid.fit(X_train_scaled, y_train)

In [None]:
print(grid.best_params_)
print(grid.best_score_)

In [None]:
predictions=grid.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions, ))

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'marge.sav'
joblib.dump(predictions, filename)