In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [40]:
df = pd.read_csv('ML internship/project 2/fraudTest.csv')
print(df.head())

   rank trans_date_trans_time        cc_num  \
0     0      21-06-2020 12:14  2.291164e+15   
1     1      21-06-2020 12:14  3.573030e+15   
2     2      21-06-2020 12:14  3.598215e+15   
3     3      21-06-2020 12:15  3.591920e+15   
4     4      21-06-2020 12:15  3.526826e+15   

                               merchant        category    amt   first  \
0                 fraud_Kirlin and Sons   personal_care   2.86    Jeff   
1                  fraud_Sporer-Keebler   personal_care  29.84  Joanne   
2  fraud_Swaniawski, Nitzsche and Welch  health_fitness  41.28  Ashley   
3                     fraud_Haley Group        misc_pos  60.05   Brian   
4                 fraud_Johnston-Casper          travel   3.19  Nathan   

       last gender                       street  ...      lat      long  \
0   Elliott      M            351 Darlene Green  ...  33.9659  -80.9355   
1  Williams      F             3638 Marsh Union  ...  40.3207 -110.4360   
2     Lopez      F         9333 Valentine Point

In [42]:
df_sample = df.sample(n=1000, random_state=42)

In [46]:
df_sample['trans_date_trans_time'] = pd.to_datetime(df_sample['trans_date_trans_time'], format='%d-%m-%Y %H:%M')

# Extract useful features from datetime column
df_sample['year'] = df_sample['trans_date_trans_time'].dt.year
df_sample['month'] = df_sample['trans_date_trans_time'].dt.month
df_sample['day'] = df_sample['trans_date_trans_time'].dt.day
df_sample['hour'] = df_sample['trans_date_trans_time'].dt.hour
df_sample['minute'] = df_sample['trans_date_trans_time'].dt.minute

# Drop the original datetime column
df_sample = df_sample.drop(['trans_date_trans_time'], axis=1)

In [48]:
df_sample = pd.get_dummies(df_sample, columns=['merchant', 'category', 'gender', 'street'])

In [50]:
X = df_sample.drop(['rank'], axis=1)  # Dropping the 'rank' column as it seems to be an identifier
y = df_sample['rank']

In [54]:
# Define preprocessing pipelines
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [56]:
X = preprocessor.fit_transform(X)

In [58]:
print(pd.DataFrame(X).head())

                                                   0
0    (0, 0)\t-0.30948847480012714\n  (0, 1)\t-0.3...
1    (0, 0)\t-0.3097608224387668\n  (0, 1)\t0.113...
2    (0, 0)\t-0.3097608504155099\n  (0, 1)\t-0.22...
3    (0, 0)\t-0.3097608224387668\n  (0, 1)\t-0.00...
4    (0, 0)\t-0.30978132745584297\n  (0, 1)\t-0.3...


In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [62]:
sgd = SGDClassifier(max_iter=1000)
sgd.fit(X_train, y_train)

In [64]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print('Classification Report:\n', classification_report(y_test, y_pred))
    print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

In [66]:
print('SGD Classifier:')
evaluate_model(sgd, X_test, y_test)

SGD Classifier:
Accuracy: 0.0
Classification Report:
               precision    recall  f1-score   support

        1322       0.00      0.00      0.00       0.0
        1481       0.00      0.00      0.00       0.0
        4934       0.00      0.00      0.00       0.0
        5535       0.00      0.00      0.00       1.0
        5982       0.00      0.00      0.00       1.0
        8644       0.00      0.00      0.00       1.0
        8756       0.00      0.00      0.00       0.0
        9069       0.00      0.00      0.00       1.0
       10871       0.00      0.00      0.00       0.0
       12503       0.00      0.00      0.00       1.0
       13042       0.00      0.00      0.00       1.0
       14016       0.00      0.00      0.00       1.0
       14299       0.00      0.00      0.00       1.0
       16633       0.00      0.00      0.00       1.0
       22562       0.00      0.00      0.00       1.0
       27038       0.00      0.00      0.00       1.0
       27827       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
