Step 1: Data Preprocessing
1. Load the CSV file:

In [83]:
import pandas as pd

# Load the data
data = pd.read_csv('Tweet_data.csv')

2. Clean 'Tweet text'.

In [84]:
import re

# Function to clean the tweet text
def clean_text(text):
    text = re.sub(r'http\S+', '', text)   # Remove URLs
    text = re.sub(r'@\w+', '', text)      # Remove mentions
    text = re.sub(r'\d+', '', text)       # Remove numbers
    text = re.sub(r'#', '', text)         # Remove the hash # sign
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Clean the 'Tweet text'
data['cleaned_text'] = data['Tweet text'].apply(clean_text)


3. Convert 'Tweet text' to numerical format (encoding):

In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=100)  # Limit number of features to 100 for simplicity

# Fit and transform the cleaned text
tfidf_features = tfidf_vectorizer.fit_transform(data['cleaned_text'])


4. Combine NLP features with numerical features: 

In [86]:

# Convert TF-IDF features to DataFrame
tfidf_features_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())


# Combine with the rest of the numerical data, excluding non-numeric and target features
numerical_features = data.select_dtypes(include=['int64', 'float64'])
combined_features = pd.concat([tfidf_features_df, numerical_features], axis=1)


5. Scale the features.

In [87]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Scale the features
scaled_features = scaler.fit_transform(combined_features)

6. Define the target variable 'virality'.

In [88]:
# Define virality based on the number of impressions
data['is_viral'] = data['impressions'] > 100000

Step 2: Build and Train the ANN

1. Adjust the input layer to match the number of features.

In [89]:
import tensorflow as tf

input_dim = scaled_features.shape[1]

# Initializing the ANN
ann_virality = tf.keras.models.Sequential()

# Adding the input layer and the first hidden layer
ann_virality.add(tf.keras.layers.Dense(units=16, activation='relu', input_dim=input_dim))

# Adding the output layer
ann_virality.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

2. Compile and Train the ANN

In [90]:
# Compiling the ANN
ann_virality.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Splitting the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    scaled_features, data['is_viral'], test_size=0.2, random_state=0
)

# Training the ANN on the Training set
ann_virality.fit(X_train, y_train, batch_size=32, epochs=100)


Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.src.callbacks.History at 0x7f082c92c5e0>

Step 3: Evaluate the Model
Evaluate the model performance with the testing dataset.

In [91]:
# Predicting the Test set results
y_pred = ann_virality.predict(X_test) > 0.5

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(cm)
print(f'Accuracy: {accuracy}')


[[155]]
Accuracy: 1.0


How to use the pretrained model on new tweets?

To predict the probability of a new tweet going viral using the trained model, we will need to preprocess the new tweet data in the same way as the training data was preprocessed.

In [92]:
def preprocess_tweet_text(new_tweet_text, tfidf_vectorizer, scaler):
    # Clean the new tweet text
    cleaned_text = clean_text(new_tweet_text)
    
    # Transform the text using the previously fitted TF-IDF vectorizer
    tfidf_features = tfidf_vectorizer.transform([cleaned_text])
    
    # Scale the TF-IDF features using the previously fitted scaler
    scaled_features = scaler.transform(tfidf_features.toarray())
    
    return scaled_features

# Predict the probability of a tweet being viral based on the text
def predict_virality(model, preprocessed_text):
    virality_probability = model.predict(preprocessed_text)
    return virality_probability

# Example usage:
new_tweet_text = "Your new tweet text here."
preprocessed_new_tweet = preprocess_tweet_text(new_tweet_text, tfidf_vectorizer, scaler)
virality_probability = predict_virality(ann_virality, preprocessed_new_tweet)
print(f"Probability of the tweet being viral: {virality_probability[0][0]:.2%}")




ValueError: X has 100 features, but StandardScaler is expecting 119 features as input.

Check the ratio of viral to non viral tweets in order to prevent overfitting: 

In [None]:
import numpy as np
# Output the number of viral tweets in the training set
viral_tweets_count = np.sum(y_train)
print(f"Number of viral tweets in the training set: {viral_tweets_count}")

# Check balance
balance_ratio = viral_tweets_count / len(y_train)
print(f"Balance ratio of viral to non-viral tweets in training set: {balance_ratio:.2%}")


Number of viral tweets in the training set: 3
Balance ratio of viral to non-viral tweets in training set: 0.49%


New Code: 



Preparing the Dataset for Training:

In [95]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Assuming 'data' contains your dataset and 'clean_text' is your text cleaning function

# Initialize and fit TF-IDF Vectorizer on cleaned tweet texts
tfidf_vectorizer = TfidfVectorizer(max_features=100)  # Adjust 'max_features' as needed
tfidf_features = tfidf_vectorizer.fit_transform(data['cleaned_text'].apply(clean_text))

# Scale TF-IDF features
scaler = StandardScaler().fit(tfidf_features.toarray())
scaled_tfidf_features = scaler.transform(tfidf_features.toarray())

# Define the target variable
data['is_viral'] = data['impressions'] > 100000  # Example criterion for virality

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    scaled_tfidf_features, 
    data['is_viral'], 
    test_size=0.2, 
    random_state=0
)

# Compute class weights for imbalance handling
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(enumerate(class_weights))


Adjusting the Neural Network Architecture

Note : Ensure the input layer of your ANN matches the number of features in your TF-IDF vectors.

In [96]:
# Initializing the ANN
ann_virality = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=16, activation='relu', input_dim=X_train.shape[1]),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

# Compiling the ANN
ann_virality.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Training the ANN with class weights
ann_virality.fit(
    X_train, 
    y_train, 
    batch_size=32, 
    epochs=100, 
    class_weight=class_weight_dict
)

TypeError: Value passed to parameter 'x' has DataType bool not in list of allowed values: bfloat16, float16, float32, float64, int8, int16, int32, int64, complex64, complex128

Plotting Test Accuracy and Loss Graphs

In [None]:
import matplotlib.pyplot as plt

# Plotting training & validation accuracy values
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Plotting training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()




NameError: name 'history' is not defined

Inference with the Trained Model

Note: Process the tweet text through the same cleaning, TF-IDF vectorization, and scaling steps before predicting with the trained model. This ensures the model predicts based on the tweet text alone.

In [None]:
def preprocess_and_predict(new_tweet_text, tfidf_vectorizer, scaler, model):
    # Clean and preprocess the new tweet text
    cleaned_text = clean_text(new_tweet_text)
    tfidf_features = tfidf_vectorizer.transform([cleaned_text])
    scaled_features = scaler.transform(tfidf_features.toarray())
    
    # Predict virality
    virality_probability = model.predict(scaled_features)
    return virality_probability

# Example usage
new_tweet_text = "Example tweet text here."
probability = preprocess_and_predict(new_tweet_text, tfidf_vectorizer, scaler, ann_virality)
print(f"Probability of being viral: {probability[0][0]:.2%}")
