In [37]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib


In [39]:

# Read both CSV files; only the training frame is used further down in this cell.
aim_data = pd.read_csv('social_media_aim.csv')
# Drop the 'Unnamed: 0' column from the training frame when it is present.
train_data = pd.read_csv('social_media_train.csv').drop(
    columns=['Unnamed: 0'], errors='ignore'
)

# The 'fake' column is the target; every other column is a feature.
y = train_data['fake']
X = train_data.drop(columns=['fake'])

# One LabelEncoder per listed column, keyed by column name so the same
# encoders can be saved and reused at prediction time.
categorical_cols = ['profile_pic', 'sim_name_username', 'extern_url', 'private']
label_encoders = {name: LabelEncoder() for name in categorical_cols}

# Fit each encoder on its column and overwrite the column with the encoded values.
for col, le in label_encoders.items():
    X[col] = le.fit_transform(X[col])

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [41]:

def predict_fake_account(input_data):
    """Classify an account as fake or real using the saved model.

    Loads the classifier from 'fake_account_classifier.pkl' and the fitted
    label encoders from 'label_encoders.pkl' on every call, encodes the
    columns that have an encoder, and predicts on the fixed feature set.

    Parameters
    ----------
    input_data : dict, list of dict, DataFrame, or anything accepted by
        ``pd.DataFrame``. A flat dict of scalar values is treated as a single
        account (one row). Must provide every column in ``feature_columns``.

    Returns
    -------
    str
        "The account is likely fake." when the prediction for the FIRST row
        equals 1, otherwise "The account is likely real.". Additional rows
        are predicted but their results are not returned.

    Raises
    ------
    KeyError
        If a required column is missing from ``input_data``.
    ValueError
        If ``input_data`` has no rows, or if an encoder rejects a value
        (raised by the encoder's ``transform``).
    """
    # pd.DataFrame() refuses a dict whose values are all scalars unless an
    # index is given; wrapping it in a list turns it into a one-row frame.
    if isinstance(input_data, dict) and input_data and all(
        pd.api.types.is_scalar(value) for value in input_data.values()
    ):
        input_data = [input_data]

    # Work on a copy so the encoding step below never writes into a
    # DataFrame owned by the caller.
    input_df = pd.DataFrame(input_data).copy()

    # NOTE(security): joblib.load unpickles the files and can execute
    # arbitrary code; only load artifacts from a trusted location.
    loaded_model = joblib.load('fake_account_classifier.pkl')
    # Expected to be a dict of column name -> fitted encoder, as written by
    # the training cell. Using its keys (rather than a notebook global) keeps
    # this function usable in a fresh kernel.
    loaded_label_encoders = joblib.load('label_encoders.pkl')

    feature_columns = ['profile_pic', 'ratio_numlen_username', 'len_fullname',
                       'ratio_numlen_fullname', 'sim_name_username',
                       'len_desc', 'extern_url', 'private',
                       'num_posts', 'num_followers', 'num_following']

    # Fail early with an explicit message instead of an opaque pandas KeyError.
    required_columns = list(dict.fromkeys([*loaded_label_encoders, *feature_columns]))
    missing_columns = [name for name in required_columns if name not in input_df.columns]
    if missing_columns:
        raise KeyError(f"input_data is missing required column(s): {missing_columns}")
    if input_df.empty:
        raise ValueError("input_data contains no rows to classify.")

    # Apply the saved encoders to their columns.
    for col, encoder in loaded_label_encoders.items():
        input_df[col] = encoder.transform(input_df[col])

    # Select and order the columns as listed in feature_columns.
    input_df = input_df[feature_columns]

    prediction = loaded_model.predict(input_df)
    return "The account is likely fake." if prediction[0] == 1 else "The account is likely real."


In [43]:

# Train the classifier on the training split. The seed matches the one used
# for train_test_split so a Restart-and-Run-All produces the same saved model.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Persist the fitted model and the label encoders; predict_fake_account
# reads both files back at prediction time.
joblib.dump(model, 'fake_account_classifier.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')


['label_encoders.pkl']