In [5]:
# Section 1: Feature Engineering & Model Tuning (Student Scores)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings

warnings.filterwarnings('ignore')

# Load student scores dataset
student_df = pd.read_csv("C:\\Users\\saile\\Downloads\\student_scores.csv")

# Check for missing values
print("Missing Values:\n", student_df.isnull().sum())

# Feature Engineering: Create Total Score and Average Score
score_columns = [col for col in student_df.columns if col.lower() in ['math', 'science', 'english']]
student_df['Total_Score'] = student_df[score_columns].sum(axis=1)
student_df['Average_Score'] = student_df['Total_Score'] / len(score_columns)

print("\nUpdated Dataset:\n", student_df.head())

# Define features and target
features = score_columns + ['Total_Score', 'Average_Score']
target_col = 'FinalGrade' if 'FinalGrade' in student_df.columns else student_df.columns[-1]
X = student_df[features]
y = student_df[target_col]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")

# Hyperparameter Tuning with GridSearchCV (Random Forest)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model and evaluation
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Optimized RMSE: {rmse:.2f}")
print(f"Optimized R² Score: {r2:.2f}")

# Section 2: Fraud Detection with Decision Trees
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Load fraud detection dataset
df = pd.read_csv("C:\\Users\\saile\\Downloads\\fraud_detection.csv")
print("\nMissing Values:\n", df.isnull().sum())
df = df.dropna()

# Encode categorical variables
le = LabelEncoder()
df['Type_encoded'] = le.fit_transform(df['Type'])

# Feature Engineering: Amount squared
df['Amount_squared'] = df['Amount'] ** 2

# Define features and target
features = ['Amount', 'Type_encoded', 'Amount_squared']
X = df[features]
y = df['Is Fraud']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")

# Hyperparameter Tuning with GridSearchCV
param_grid = {
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_dt = grid_search.best_estimator_

# Evaluate Model Performance
y_pred = best_dt.predict(X_test)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
print('Classification Report:')
print(classification_report(y_test, y_pred, zero_division=0))
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')

# Recommendations for improvement
print('Recommendations:')
print('- Try more advanced feature engineering (e.g., transaction frequency, time-based features).')
print('- Use ensemble methods like Random Forest or boosting for better accuracy.')
print('- Address class imbalance if present using SMOTE or class weights.')

Missing Values:
 Student ID     0
Math           0
Science        0
English        0
Total_Score    0
dtype: int64

Updated Dataset:
   Student ID  Math  Science  English  Total_Score  Average_Score
0       S001    78       85       74          237           79.0
1       S002    56       62       59          177           59.0
2       S003    90       88       92          270           90.0
3       S004    70       65       72          207           69.0
4       S005    82       79       85          246           82.0
Training samples: 8, Testing samples: 2
Optimized RMSE: 5.40
Optimized R² Score: 0.09

Missing Values:
 Transaction ID    0
Amount            0
Type              0
Is Fraud          0
dtype: int64
Training samples: 8, Testing samples: 2
Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         