In [None]:
import pandas as pd

# Read the train and test splits from their hard-coded /content paths.
train_df = pd.read_csv('/content/fraudTrain.csv')
test_df = pd.read_csv('/content/fraudTest.csv')

# Print a banner followed by the first five rows of each frame.
for banner, frame in (("Train data:", train_df), ("\nTest data:", test_df)):
    print(banner)
    print(frame.head())

Train data:
   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 

In [None]:
# Columns removed from both splits before any modelling.
drop_cols = [
    'Unnamed: 0',
    'cc_num',
    'trans_num',
    'first',
    'last',
    'street',
    'trans_date_trans_time',
    'dob',
]

# Drop in place so `train_df` / `test_df` keep referring to the same objects.
# NOTE: running this cell a second time raises KeyError, because the columns
# are already gone.
for frame in (train_df, test_df):
    frame.drop(columns=drop_cols, inplace=True)

In [None]:
# Count missing cells per column for each split, then report them.
train_missing = train_df.isnull().sum()
test_missing = test_df.isnull().sum()

print("Train missing values:\n", train_missing)
print("\nTest missing values:\n", test_missing)


Train missing values:
 merchant      0
category      0
amt           0
gender        0
city          0
state         0
zip           1
lat           1
long          1
city_pop      1
job           1
unix_time     1
merch_lat     1
merch_long    1
is_fraud      1
dtype: int64

Test missing values:
 merchant      0
category      0
amt           0
gender        0
city          0
state         0
zip           0
lat           0
long          0
city_pop      0
job           0
unix_time     0
merch_lat     0
merch_long    0
is_fraud      0
dtype: int64


In [None]:
# Remove, in place, every training row that has at least one missing value
# (axis=0 / how="any" are the defaults, spelled out for clarity).
train_df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Show the dtype of every column left in the training frame.
column_dtypes = train_df.dtypes
print(column_dtypes)


merchant       object
category       object
amt           float64
gender         object
city           object
state          object
zip           float64
lat           float64
long          float64
city_pop      float64
job            object
unix_time     float64
merch_lat     float64
merch_long    float64
is_fraud      float64
dtype: object


In [None]:
# Report how many distinct values each text column holds in the training frame.
text_columns = ['merchant', 'category', 'gender', 'city', 'state', 'job']
distinct_counts = train_df[text_columns].nunique()
for col, count in distinct_counts.items():
    print(f"{col}: {count} unique values")


merchant: 693 unique values
category: 14 unique values
gender: 2 unique values
city: 863 unique values
state: 51 unique values
job: 483 unique values


In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


# Encode, scale, fit and evaluate a class-weighted logistic regression.
# Reads `train_df` / `test_df` from the earlier cells and leaves them rebound
# to their one-hot-encoded, column-aligned versions.

# Drop high-cardinality columns (too many unique values to one-hot encode)
high_card_cols = ['merchant', 'city', 'job']
train_df.drop(columns=high_card_cols, inplace=True)
test_df.drop(columns=high_card_cols, inplace=True)

# One-hot encode low-cardinality categorical columns.
# Both frames are first cast to a categorical dtype built from the TRAINING
# levels, so get_dummies emits the same dummy columns for each frame and
# drop_first removes the same reference level in both. Encoding the frames
# independently would drop a different level from the test frame whenever it
# lacks the first training level, and the later zero-fill would then silently
# mis-encode those rows. Test values unseen in training become NaN under the
# cast and therefore encode as all-zero rows.
cat_cols = ['gender', 'category', 'state']
cat_dtypes = {
    col: pd.CategoricalDtype(categories=sorted(train_df[col].unique()))
    for col in cat_cols
}
train_df = pd.get_dummies(train_df.astype(cat_dtypes), columns=cat_cols, drop_first=True)
test_df = pd.get_dummies(test_df.astype(cat_dtypes), columns=cat_cols, drop_first=True)

# Align columns to the training layout; with the shared categorical dtypes
# above this is a safety net rather than the mechanism that fixes mismatches.
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)

# Split into features and target.
# The training target is float64 at this point (the NaN row removed earlier
# forced the upcast), so cast both targets to int to give the classifier and
# the metrics one consistent label type.
X_train = train_df.drop('is_fraud', axis=1)
y_train = train_df['is_fraud'].astype(int)

X_test = test_df.drop('is_fraud', axis=1)
y_test = test_df['is_fraud'].astype(int)

# Scale features: statistics are fitted on the training split only and then
# re-used for the test split. X_train / X_test become NumPy arrays here.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train Logistic Regression model (class_weight='balanced' reweights classes
# inversely to their frequency in y_train)
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

# Predict and evaluate on the test split
y_pred = model.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Confusion Matrix:
 [[518620  34954]
 [   582   1563]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.94      0.97    553574
           1       0.04      0.73      0.08      2145

    accuracy                           0.94    555719
   macro avg       0.52      0.83      0.52    555719
weighted avg       1.00      0.94      0.96    555719



In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Score the test split again with the already-fitted model.
y_pred = model.predict(X_test)

# Compute every metric first, then print them in a fixed order.
overall_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)
error_matrix = confusion_matrix(y_test, y_pred)

print("✅ Accuracy:", overall_accuracy)
print("\n📊 Classification Report:\n", per_class_report)
print("\n🔍 Confusion Matrix:\n", error_matrix)


✅ Accuracy: 0.936054012909402

📊 Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.94      0.97    553574
           1       0.04      0.73      0.08      2145

    accuracy                           0.94    555719
   macro avg       0.52      0.83      0.52    555719
weighted avg       1.00      0.94      0.96    555719


🔍 Confusion Matrix:
 [[518620  34954]
 [   582   1563]]
