## Detect fake profiles in online social networks using Random Forest

In [None]:
pip install sklearn


In [54]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

###### function for reading dataset from csv files

def read_datasets():
    """Reads users profile from csv files"""
    genuine_users = pd.read_csv("data/users.csv")
    fake_users = pd.read_csv("data/fusers.csv")
    x = pd.concat([genuine_users, fake_users])
    y = len(fake_users) * [0] + len(genuine_users) * [1]
    return x, y

###### function for feature engineering

def extract_features(x):
    lang_list = list(enumerate(np.unique(x['lang'])))
    lang_dict = {name: i for i, name in lang_list}
    x.loc[:, 'lang_code'] = x['lang'].map(lambda x: lang_dict[x]).astype(int)
    
    feature_columns_to_use = ['statuses_count', 'followers_count', 'friends_count', 
                              'favourites_count', 'listed_count', 'lang_code']
    x = x.loc[:, feature_columns_to_use]
    return x

###### function for training model

def train_model(X_train, y_train, model_type='random_forest'):
    """Train the dataset using RandomForest or GradientBoosting."""
    
    if model_type == 'random_forest':
        clf = RandomForestClassifier(n_estimators=40, oob_score=True)
    elif model_type == 'gradient_boosting':
        clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
    
    clf.fit(X_train, y_train)
    print(f"The trained model is: {clf}")
    return clf

###### Main flow of the program

print("Reading datasets...\n")
x, y = read_datasets()
x.describe()

print("Extracting features...\n")
x = extract_features(x)
print(x.columns)
print(x.describe())

print("Splitting datasets into train and validation...\n")
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.20, random_state=44)

print("Training the model...\n")
trained_model = train_model(X_train, y_train, model_type='random_forest')

# Now you can use this trained model in the `predict_n_rows` function to predict on input.csv


Reading datasets...

Extracting features...

Index(['statuses_count', 'followers_count', 'friends_count',
       'favourites_count', 'listed_count', 'lang_code'],
      dtype='object')
       statuses_count  followers_count  friends_count  favourites_count  \
count     2818.000000      2818.000000    2818.000000       2818.000000   
mean      1672.198368       371.105039     395.363023        234.541164   
std       4884.669157      8022.631339     465.694322       1445.847248   
min          0.000000         0.000000       0.000000          0.000000   
25%         35.000000        17.000000     168.000000          0.000000   
50%         77.000000        26.000000     306.000000          0.000000   
75%       1087.750000       111.000000     519.000000         37.000000   
max      79876.000000    408372.000000   12773.000000      44349.000000   

       listed_count    lang_code  
count   2818.000000  2818.000000  
mean       2.818666     2.851313  
std       23.480430     1.992950  

In [57]:
import pandas as pd
import numpy as np
import csv

# Function to read n rows from input file
def read_n_rows(input_file, n):
    input_data = pd.read_csv(input_file, nrows=n)
    return input_data

# Function to predict for n rows and write to output.csv
def predict_n_rows(model, input_file, output_file, n):
    # Load the input data
    input_data = read_n_rows(input_file, n)
    
    # Extract features from input
    input_data = extract_features(input_data)
    
    # Make predictions
    predictions = model.predict(input_data)
    
    # Convert to class labels (0 or 1)
    predicted_classes = predictions
    
    # Save predictions to output.csv
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Prediction'])  # Add header
        for pred in predicted_classes:
            writer.writerow([pred])
    
    print(f"Predictions for {n} rows saved to {output_file}")

# Example usage
input_file = r"C:\Users\91875\Desktop\smart\Fake-SocialMedia-Detection\Datatransition\input.csv"
output_file = r"C:\Users\91875\Desktop\smart\Fake-SocialMedia-Detection\Datatransition\output.csv"
n = 1  # Specify the number of rows to predict
predict_n_rows(trained_model, input_file, output_file, n)


Predictions for 1 rows saved to C:\Users\91875\Desktop\smart\Fake-SocialMedia-Detection\Datatransition\output.csv
