In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
!cp "/content/drive/My Drive/archive.zip" "/content/"



In [5]:
import zipfile
zip_path = "/content/archive.zip"
extract_path = "/content/"
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)


In [6]:
!find /content/ -name "*.csv"


/content/drive/MyDrive/dataset.csv
/content/fraudTest.csv
/content/fraudTrain.csv
/content/sample_data/california_housing_train.csv
/content/sample_data/california_housing_test.csv
/content/sample_data/mnist_test.csv
/content/sample_data/mnist_train_small.csv


In [7]:
import pandas as pd
df_train = pd.read_csv("/content/fraudTrain.csv")
df_test = pd.read_csv("/content/fraudTest.csv")
print(df_train.info())
print(df_test.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [8]:
print("Missing values in training data:\n", df_train.isnull().sum())
print("Missing values in test data:\n", df_test.isnull().sum())


Missing values in training data:
 Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64
Missing values in test data:
 Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state

In [9]:
print(df_train['is_fraud'].value_counts(normalize=True))
print(df_test['is_fraud'].value_counts(normalize=True))


is_fraud
0    0.994211
1    0.005789
Name: proportion, dtype: float64
is_fraud
0    0.99614
1    0.00386
Name: proportion, dtype: float64


In [10]:
df_train.drop(columns=['Unnamed: 0', 'first', 'last', 'street', 'dob'], inplace=True)
df_test.drop(columns=['Unnamed: 0', 'first', 'last', 'street', 'dob'], inplace=True)


In [11]:
df_train['trans_date_trans_time'] = pd.to_datetime(df_train['trans_date_trans_time'])
df_test['trans_date_trans_time'] = pd.to_datetime(df_test['trans_date_trans_time'])


In [12]:
df_train['transaction_hour'] = df_train['trans_date_trans_time'].dt.hour
df_train['transaction_day'] = df_train['trans_date_trans_time'].dt.day
df_train['transaction_month'] = df_train['trans_date_trans_time'].dt.month
df_train['transaction_weekday'] = df_train['trans_date_trans_time'].dt.weekday

df_test['transaction_hour'] = df_test['trans_date_trans_time'].dt.hour
df_test['transaction_day'] = df_test['trans_date_trans_time'].dt.day
df_test['transaction_month'] = df_test['trans_date_trans_time'].dt.month
df_test['transaction_weekday'] = df_test['trans_date_trans_time'].dt.weekday


In [13]:
print(df_train.columns)
print(df_test.columns)


Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'gender', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job',
       'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'transaction_hour', 'transaction_day', 'transaction_month',
       'transaction_weekday'],
      dtype='object')
Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'gender', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job',
       'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'transaction_hour', 'transaction_day', 'transaction_month',
       'transaction_weekday'],
      dtype='object')


In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
numeric_columns = ['amt', 'lat', 'long', 'city_pop', 'zip']
categorical_columns = ['merchant', 'category', 'gender', 'city', 'state', 'job']
X_train = df_train[categorical_columns + numeric_columns]
y_train = df_train['is_fraud']
X_test = df_test[categorical_columns + numeric_columns]
y_test = df_test['is_fraud']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ])


X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"Processed training data shape: {X_train_processed.shape}")
print(f"Processed test data shape: {X_test_processed.shape}")


Processed training data shape: (1296675, 2153)
Processed test data shape: (555719, 2153)


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
model = LogisticRegression(max_iter=1000)
model.fit(X_train_processed, y_train)
y_pred = model.predict(X_test_processed)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9955409118637297
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719

