## Detect fake profiles in online social networks using Support Vector Machine

In [5]:
pip install numpy==1.24


Collecting numpy==1.24Note: you may need to restart the kernel to use updated packages.

  Downloading numpy-1.24.0-cp311-cp311-win_amd64.whl.metadata (5.6 kB)
Downloading numpy-1.24.0-cp311-cp311-win_amd64.whl (14.8 MB)
   ---------------------------------------- 0.0/14.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/14.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/14.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/14.8 MB 220.2 kB/s eta 0:01:08
   ---------------------------------------- 0.0/14.8 MB 220.2 kB/s eta 0:01:08
   ---------------------------------------- 0.0/14.8 MB 220.2 kB/s eta 0:01:08
   ---------------------------------------- 0.1/14.8 MB 234.9 kB/s eta 0:01:03
   ---------------------------------------- 0.1/14.8 MB 306.3 kB/s eta 0:00:49
   ---------------------------------------- 0.1/14.8 MB 306.3 kB/s eta 0:00:49
   ---------------------------------------- 0.1/14.8 MB 238.8 kB/s eta 0:01:02
   -------------

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.0 requires FuzzyTM>=0.4.0, which is not installed.
seaborn 0.12.2 requires numpy!=1.24.0,>=1.17, but you have numpy 1.24.0 which is incompatible.


In [7]:
pip install pybind11>=2.12


Note: you may need to restart the kernel to use updated packages.


In [None]:
import csv

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

# Function to read n rows from input file
def read_n_rows(input_file, n):
    """Load at most the first ``n`` data rows of a CSV file.

    Args:
        input_file: Path or file-like object accepted by ``pandas.read_csv``.
        n: Maximum number of data rows to read (the header is not counted).

    Returns:
        A ``pandas.DataFrame`` holding the rows that were read.
    """
    return pd.read_csv(input_file, nrows=n)

# Function to extract features
def extract_features(x):
    """Build the numeric feature frame used by the classifier.

    The five count columns are copied as-is and the ``lang`` column is encoded
    as an integer ``lang_code``: the sorted unique values of ``x['lang']`` are
    numbered 0, 1, 2, ...

    The input frame is NOT modified; a new DataFrame is returned.

    Note: the language codes are derived from the values present in ``x``
    itself, so the same language can receive different codes in two different
    frames (e.g. a training set vs. a small prediction file).

    Args:
        x: DataFrame containing ``statuses_count``, ``followers_count``,
            ``friends_count``, ``favourites_count``, ``listed_count`` and
            ``lang`` columns.

    Returns:
        A new DataFrame with exactly the columns ``statuses_count``,
        ``followers_count``, ``friends_count``, ``favourites_count``,
        ``listed_count``, ``lang_code`` (in that order).

    Raises:
        KeyError: If any of the required columns is missing from ``x``.
    """
    # np.unique returns the distinct languages in sorted order; their position
    # in that order becomes the integer code.
    lang_dict = {name: code for code, name in enumerate(np.unique(x['lang']))}
    lang_code = x['lang'].map(lambda lang: lang_dict[lang]).astype(int)

    count_columns = ['statuses_count', 'followers_count', 'friends_count', 'favourites_count', 'listed_count']
    # Work on a copy of the selected columns so the caller's frame never gains
    # a surprise 'lang_code' column.
    features = x.loc[:, count_columns].copy()
    features['lang_code'] = lang_code
    return features

# Function to predict n rows and write to output.csv
def predict_n_rows(model, input_file, output_file, n, scaler=None):
    """Predict labels for the first ``n`` rows of a CSV file and save them.

    Args:
        model: Fitted classifier exposing ``predict``.
        input_file: CSV source passed to ``read_n_rows``.
        output_file: Destination CSV path; it is overwritten. The file gets a
            ``Prediction`` header followed by one prediction per line.
        n: Maximum number of rows to read from ``input_file``.
        scaler: Optional fitted transformer exposing ``transform`` (for
            example the ``StandardScaler`` returned by ``train_with_scaler``).
            When given, the features are scaled with the statistics learned
            on the training data. When omitted, the legacy behaviour is kept:
            the rows are standardised against their own mean/std, which maps
            a single row to all zeros and therefore ignores its content.

    Note: ``lang_code`` is produced by ``extract_features`` from the languages
    present in the rows read here, so it is not guaranteed to match the codes
    the model saw during training.
    """
    # Load the input data and reduce it to the model's feature columns.
    features = extract_features(read_n_rows(input_file, n))

    # Scale with the training statistics when available (see ``scaler``).
    if scaler is not None:
        scaled = scaler.transform(features)
    else:
        scaled = preprocessing.scale(features)

    predictions = model.predict(scaled)

    # Save predictions to output.csv
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Prediction'])  # Add header
        writer.writerows([pred] for pred in predictions)

    # Report how many rows were actually predicted; the file may hold fewer
    # than the requested n.
    print(f"Predictions for {len(predictions)} rows saved to {output_file}")

# Function to train the model
def train_with_scaler(X_train, y_train):
    """Fit a feature scaler and grid-search an SVM on the scaled features.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        Tuple ``(best_estimator, scaler)``: the best ``SVC`` found by the grid
        search, and the ``StandardScaler`` fitted on ``X_train`` that must be
        applied to any data passed to that estimator.
    """
    # StandardScaler is the transformer form of preprocessing.scale; keeping
    # the fitted object lets prediction reuse the training mean/std.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_scaled = scaler.transform(X_train)

    # Candidate C and gamma values: 10**-2 ... 10**2.5 in half-decade steps.
    Cs = 10.0 ** np.arange(-2, 3, .5)
    gammas = 10.0 ** np.arange(-2, 3, .5)
    param = [{'gamma': gammas, 'C': Cs}]
    classifier = SVC(probability=True)
    clf = GridSearchCV(classifier, param_grid=param, cv=5)
    clf.fit(X_scaled, y_train)
    print("The best classifier is: ", clf.best_estimator_)

    return clf.best_estimator_, scaler


def train(X_train, y_train):
    """Trains and returns an SVM classifier.

    Thin wrapper around ``train_with_scaler`` for callers that only need the
    estimator. The returned model expects standardised features.
    """
    best_estimator, _ = train_with_scaler(X_train, y_train)
    return best_estimator

# Main flow of the program
print("Reading datasets.....\n")
# NOTE(review): read_datasets() is not defined in this cell; it has to exist
# in the kernel already (presumably defined elsewhere in the notebook).
x, y = read_datasets()

print("Extracting features.....\n")
x = extract_features(x)
print(x.columns)
print(x.describe())

print("Splitting datasets into train and test datasets...\n")
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=44)

print("Training datasets.......\n")
# Keep the fitted scaler so new rows are scaled exactly like the training set.
trained_model, feature_scaler = train_with_scaler(X_train, y_train)

# Predict n rows and save the predictions
# NOTE: machine-specific absolute paths; adjust them to the local checkout.
input_file = r"C:\Users\91875\Desktop\smart\Fake-SocialMedia-Detection\Datatransition\input.csv"
output_file = r"C:\Users\91875\Desktop\smart\Fake-SocialMedia-Detection\Datatransition\output.csv"
n = 1  # Specify the number of rows to predict
predict_n_rows(trained_model, input_file, output_file, n, scaler=feature_scaler)


Reading datasets.....

Extracting features.....

Index(['statuses_count', 'followers_count', 'friends_count',
       'favourites_count', 'listed_count', 'lang_code'],
      dtype='object')
       statuses_count  followers_count  friends_count  favourites_count  \
count     2818.000000      2818.000000    2818.000000       2818.000000   
mean      1672.198368       371.105039     395.363023        234.541164   
std       4884.669157      8022.631339     465.694322       1445.847248   
min          0.000000         0.000000       0.000000          0.000000   
25%         35.000000        17.000000     168.000000          0.000000   
50%         77.000000        26.000000     306.000000          0.000000   
75%       1087.750000       111.000000     519.000000         37.000000   
max      79876.000000    408372.000000   12773.000000      44349.000000   

       listed_count    lang_code  
count   2818.000000  2818.000000  
mean       2.818666     2.851313  
std       23.480430     1.9929