In [5]:
# Execute main.ipynb in this kernel before anything else in this notebook runs.
# NOTE(review): the cells below use names that are never defined in this notebook
# (pd, pickle, train_test_split, SimpleImputer, StandardScaler, SMOTE, PCA, SVC,
# s3_utils, output_file_key_data_feature_engineering) - presumably main.ipynb
# provides them; confirm.
# The commented-out %run lines are the upstream pipeline stages; uncomment them only
# if those notebooks also have to be re-executed when running this notebook on its own.
%run main.ipynb
#%run data_cleaning.ipynb
#%run data_visualization.ipynb
#%run feature_engineering.ipynb

In [6]:
# Load the FEATURE ENGINEERING output CSV from S3 into a DataFrame using the
# S3Utils helper methods.
# NOTE(review): `s3_utils` and `output_file_key_data_feature_engineering` are not
# defined in this notebook; presumably they come from `%run main.ipynb` - confirm.
if not s3_utils.check_file_exists(output_file_key_data_feature_engineering):
    # Fail here with a clear message. Silently skipping the load would leave `data`
    # undefined (NameError in the next cell) or, on a reused kernel, let the rest of
    # the notebook train on a stale `data` left over from an earlier run.
    raise FileNotFoundError(
        "Feature-engineering output not found in S3: "
        f"{output_file_key_data_feature_engineering!r}"
    )

data = s3_utils.read_csv_from_s3(output_file_key_data_feature_engineering)

In [7]:
# Separate the label column ('target') from the feature columns.
y = data['target']
X = data.drop(columns='target')

# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
# stratify=y preserves the class proportions of `y` in both splits. The SMOTE and
# class_weight='balanced' steps further down treat the classes as imbalanced, and an
# unstratified random split can leave the test set with very few (or no) minority
# samples, which makes the evaluation unreliable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [8]:
# Preprocessing. Every transformer is fitted on the training split only and then
# applied unchanged to the test split.

# 1) Fill missing values with the per-column mean of the training data
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# 2) Scale the imputed features with the scaler fitted on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# 3) Oversample with SMOTE - training split only; the test split is never resampled
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# 4) PCA fitted on the resampled training data, keeping enough components to
#    retain 95% of the variance; the test split is projected with the same PCA
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_res)
X_test_pca = pca.transform(X_test_scaled)

# Fit a linear-kernel SVM with balanced class weights on the PCA-reduced, resampled
# training data. fit() returns the estimator itself, so construction and training
# are chained into one statement.
model = SVC(kernel='linear', class_weight='balanced').fit(X_train_pca, y_train_res)

# Predict labels for the PCA-transformed test set
y_pred = model.predict(X_test_pca)

# Put the true and predicted test labels side by side and write them to CSV
# (the DataFrame index is not written).
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
predictions_df.to_csv('svm_improved_predictions.csv', index=False)

# Persist the fitted estimator together with the fitted preprocessing objects
# (imputer, scaler, PCA) in a single pickle file.
with open('model.pkl', 'wb') as file:
    pickle.dump(dict(model=model, imputer=imputer, scaler=scaler, pca=pca), file)
