In [4]:
from joblib import load
import numpy as np
import pandas as pd
import pickle

In [5]:
# load the processed Daily stat data set 
server_daily_stats = pd.read_csv('Server_daily_stat.csv')

In [6]:
# Load the RandomForest classifier and vectorizer from a file
classifier = load('scam_classifier.joblib')
vectorizer = load('scam_vectorizer.joblib')

In [11]:
# load the LSTM model set
model_set3 = load_dict('Forecast_Model_Set3.pkl')
model_set7 = load_dict('Forecast_Model_Set7.pkl')

2024-03-13 08:49:57.694251: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:

def predict_scam(text):
    """ Predict scam alert on new data using the RandomForest classifier"""
    text_vector = vectorizer.transform([text])
    prediction = classifier.predict(text_vector)
    return "Possible Scam Detected" if prediction[0] == "Possible Scam Detected" else "Safe and Sound"

def load_dict(file_path):
    """
    Load a dictionary from a pickle file.

    Args:
    file_path (str): The path to the pickle file to be loaded.

    Returns:
    dict: The dictionary loaded from the pickle file.
    """
    with open(file_path, 'rb') as file:  # Open the file in binary read mode
        loaded_dict = pickle.load(file)  # Load the dictionary from the file
    return loaded_dict

def pred_next_discord_stat(time_steps,input_df,t, selected_model):
    """
    Predict next day discord stats data

    Args:
    input_df (df) is dataframe of past daily stats (see server_daily_stats for example)
    t is target property
    times_steps is determined by model used

    Returns:
    Float, predicted value for next day  data of a given target properties
    """
    
    input_data = input_df[t]
    data = input_data.values
    
    # Ensure data is a 2D array for scaling
    scaled_input = data.reshape(-1, 1)
    
    # Calculate the number of samples
    samples = len(data) - time_steps
    
    # Reformat for input
    X = np.array([scaled_input[i:(i + time_steps), 0] for i in range(samples)])
    
    if X.size > 0:
        X = X.reshape(X.shape[0], time_steps, 1)  # Reshape for LSTM input shape [samples, time steps, features]
        
        # Correctly define the last sequence
        # If you want to predict using the last available sequence from X
        if len(X) > 0:  # Ensure there is at least one sequence
            last_sequence = X[-1].reshape(1, time_steps, 1)  # This should have the shape [1, time_steps, 1]
            predicted_value = selected_model.predict(last_sequence, verbose=0)
    else:
        print("Not enough data to create a sequence.")
    return predicted_value[0][0]

In [24]:
def pred_next_day_df(input_df, time_steps,model_set):
    """Predict for each target in discord daily stats and return next day df"""
    target_list = ['price','n_author','n_channels','n_activities','n_words_ave','Sentiment_value']
    next_day_dict = {}
    for t in target_list:
        selected_model = model_set[t]
        next_day_val = pred_next_discord_stat(time_steps,input_df,t,selected_model)
        if t == 'n_author' or t =='n_channels' or t == 'n_activities':
            next_day_val = round(next_day_val)
        new_entry = {t:next_day_val}
        next_day_dict.update(new_entry)
    next_day_df = pd.DataFrame(next_day_dict,index = [0])
    return next_day_df

In [28]:
# Predict on next-day data using the LSTM-trained Model Set
input_df = server_daily_stats[800:810]  #
time_steps = 7 # or 3 depending on which model set is used
model_set = model_set7
next_day_df = pred_next_day_df(input_df, time_steps, model_set)

In [29]:
next_day_df 

Unnamed: 0,price,n_author,n_channels,n_activities,n_words_ave,Sentiment_value
0,0.457622,4,0,0,0.944736,0.254161


In [30]:
# Actual answer is the next day data from given input
server_daily_stats[810:811]

Unnamed: 0.1,Unnamed: 0,Year,Month,Day,price,n_author,n_channels,n_activities,n_words_ave,Sentiment_value
810,810,2022,2,24,0.446491,22,6,49,27.428571,0.260756
