In [14]:
# Uncomment below if you want to run this file only
%run main.ipynb
#%run data_cleaning.ipynb
#%run data_visualization.ipynb
#%run feature_engineering.ipynb

In [15]:
# Read the CSV From FEATURE ENGINEERING data source file from S3 into a DataFrame
# Use the methods from the S3Utils class
if s3_utils.check_file_exists(output_file_key_data_feature_engineering):
    data = s3_utils.read_csv_from_s3(output_file_key_data_feature_engineering)

In [16]:
# Separate features and target
X = data.drop('target', axis=1)  # Replace 'target' with the name of your target column
y = data['target']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
from sklearn.svm import LinearSVC
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Assuming X_train, X_test, y_train, and y_test are predefined

# Preprocess the data
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
smote = SMOTE(random_state=42, n_jobs=-1)

# Fit the imputer and scaler on the training data and transform both training and test data
X_train_imputed = imputer.fit_transform(X_train)
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

X_test_imputed = imputer.transform(X_test)
X_test_scaled = scaler.transform(X_test_imputed)

# Fit PCA on the resampled training data and transform both training and test data
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_res)
X_test_pca = pca.transform(X_test_scaled)

# Train the SVM model using LinearSVC
model = LinearSVC(class_weight='balanced', dual=False, random_state=42)
model.fit(X_train_pca, y_train_res)

# Make predictions on the test set
y_pred = model.predict(X_test_pca)

# Save the predictions to a CSV file
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
predictions_df.to_csv('svm_predictions.csv', index=False)

# Save the trained model and preprocessing objects
with open('model_pipeline.pkl', 'wb') as file:
    pickle.dump({'model': model, 'imputer': imputer, 'scaler': scaler, 'pca': pca, 'smote': smote}, file)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Calculate and display precision, recall, F1 score, and ROC AUC
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred) if len(np.unique(y_test)) == 2 else "N/A"

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")

# Hyperparameter tuning with RandomizedSearchCV
param_distributions = {
    
    'C': np.logspace(-3, 3, 7)
}

random_search = RandomizedSearchCV(model, param_distributions, n_iter=10, refit=True, verbose=2, random_state=42)
random_search.fit(X_train_res, y_train_res)

# Evaluate the best model found by random search
best_model = random_search.best_estimator_
best_predictions = best_model.predict(X_test_pca)
print(classification_report(y_test, best_predictions))

# Calculate and display precision, recall, F1 score, and ROC AUC for the best model
best_precision = precision_score(y_test, best_predictions)
best_recall = recall_score(y_test, best_predictions)
best_f1 = f1_score(y_test, best_predictions)
best_roc_auc = roc_auc_score(y_test, best_predictions) if len(np.unique(y_test)) == 2 else "N/A"

print(f"Best Model Precision: {best_precision}")
print(f"Best Model Recall: {best_recall}")
print(f"Best Model F1 Score: {best_f1}")
print(f"Best Model ROC AUC: {best_roc_auc}")




[[36826 28487]
 [ 1643  4082]]
              precision    recall  f1-score   support

           0       0.96      0.56      0.71     65313
           1       0.13      0.71      0.21      5725

    accuracy                           0.58     71038
   macro avg       0.54      0.64      0.46     71038
weighted avg       0.89      0.58      0.67     71038

Precision: 0.12533390647548281
Recall: 0.7130131004366812
F1 Score: 0.21319266725857836
ROC AUC: 0.6384259230843857
Fitting 5 folds for each of 7 candidates, totalling 35 fits




[CV] END ............................................C=0.001; total time=   1.2s
[CV] END ............................................C=0.001; total time=   1.3s
[CV] END ............................................C=0.001; total time=   1.3s
[CV] END ............................................C=0.001; total time=   1.2s
[CV] END ............................................C=0.001; total time=   1.3s
[CV] END .............................................C=0.01; total time=   1.2s
[CV] END .............................................C=0.01; total time=   1.2s
[CV] END .............................................C=0.01; total time=   1.3s
[CV] END .............................................C=0.01; total time=   1.2s
[CV] END .............................................C=0.01; total time=   1.2s
[CV] END ..............................................C=0.1; total time=   1.2s
[CV] END ..............................................C=0.1; total time=   1.2s
[CV] END ...................