# Using the Gensim Library

In [None]:
from pathlib import Path

import gensim
import gensim.downloader as api
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's
# downloader API; `wv` is used further down to embed the tokenised articles.
wv = api.load('word2vec-google-news-300')

In [None]:
# Read the two source CSV files (paths are requested interactively) and tag
# every row with its class label: df1 is the fake set (0), df2 the real set (1).
df1 = pd.read_csv(input("Enter the file path for the fake dataset")).assign(target=0)
df2 = pd.read_csv(input("Enter the file path for the real dataset")).assign(target=1)

# Stack both frames into one, then shuffle the rows with a fixed seed and
# renumber the index from zero.
combined_df = pd.concat([df1, df2], ignore_index=True)
data = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)
print(data.head())

In [None]:
data.head()

In [None]:
data.info()

In [None]:
# Remove rows containing missing values; this modifies `data` in place, so the
# previews shown in the cells above may no longer match the frame.
data.dropna(inplace=True)

In [None]:
data.target.value_counts()

Since the numbers of Fake and Real samples are almost the same, there is no significant class imbalance.

In [None]:
data.subject.value_counts()

In [None]:
# Visualize how many articles fall under each subject.
# `subject` is categorical, so a bar chart of the per-category counts is used
# instead of a histogram: histogram bins are numeric intervals and do not line
# up with the category tick labels.
subject_counts = data['subject'].value_counts()

fig, ax = plt.subplots()
ax.bar(subject_counts.index, subject_counts.values, edgecolor='black')
ax.tick_params(axis='x', rotation=90)
ax.set_xlabel('Subjects')
ax.set_ylabel('Frequency')
ax.set_title('Number of articles per subject')
plt.show()

## Data Preprocessing

In [None]:
# Tokenise every article with gensim's simple_preprocess, allowing tokens of
# up to 20 characters, and store the resulting token lists in a new column.
def _tokenize(raw_text):
    """Return the list of tokens gensim extracts from one article body."""
    return gensim.utils.simple_preprocess(raw_text, max_len=20)

data['cleaned_text'] = data['text'].apply(_tokenize)

In [None]:
data.head()

In [None]:
# Create an (untrained) Word2Vec model for this dataset:
#   window=6    -> context window size passed to gensim
#   min_count=1 -> minimum token frequency passed to gensim (nothing is filtered out)
#   workers=4   -> number of worker threads
model = gensim.models.Word2Vec(window=6, min_count=1, workers=4)

# Build the vocabulary from the tokenised articles before training
model.build_vocab(data['cleaned_text'])

In [None]:
# Train the model on the tokenised articles for 5 epochs; the number of
# examples is taken from the model's own corpus_count.
model.train(data['cleaned_text'], total_examples=model.corpus_count, epochs=5)

# Save the trained model. The target directory is created first so that the
# save does not fail on a fresh checkout where "word2vec/" does not exist yet.
model_path = Path("word2vec/word2vec_model")
model_path.parent.mkdir(parents=True, exist_ok=True)
model.save(str(model_path))

In [None]:
model.wv.index_to_key[:5]

In [None]:
len(model.wv.index_to_key)

In [None]:
# Average the word vectors of all in-vocabulary tokens of one document
def get_average_word2vec_vector(text, model, word_dim):
  """Return the mean vector of the tokens in ``text`` known to ``model``.

  Args:
    text: iterable of tokens.
    model: object whose ``wv`` attribute supports ``in`` and ``[]`` lookup.
    word_dim: length of the returned vector.

  Returns:
    A float array of shape ``(word_dim,)``; all zeros when no token of
    ``text`` is found in ``model.wv``.
  """
  keyed_vectors = model.wv
  total = np.zeros((word_dim,))
  hits = 0
  for token in text:
    if token not in keyed_vectors:
      continue  # out-of-vocabulary tokens do not contribute
    total += keyed_vectors[token]
    hits += 1
  if hits == 0:
    return total
  total /= hits
  return total

# Dimensionality of the vectors produced by the model trained above
word_dim = model.vector_size

# One averaged vector per tokenised article, in the same order as the rows of `data`
word_vectors1 = list(
    get_average_word2vec_vector(tokens, model, word_dim)
    for tokens in data['cleaned_text']
)


In [None]:

# Average the pretrained (Google News) word vectors over the tokens of a sentence
def word_vec(sent, vectors=None):
    """Return the mean pretrained embedding of the tokens in ``sent``.

    Parameters
    ----------
    sent : iterable of str
        Tokens of one document.
    vectors : optional
        Object exposing ``vector_size`` and supporting ``in`` / ``[]`` lookup.
        Defaults to the notebook-level ``wv`` loaded at the top.

    Returns
    -------
    numpy.ndarray
        Array of length ``vectors.vector_size``. It is the arithmetic mean of
        the vectors of the tokens found in ``vectors``, or all zeros when none
        of the tokens is found.
    """
    if vectors is None:
        vectors = wv
    wv_res = np.zeros(vectors.vector_size)
    # Count only the tokens that were actually found, starting from zero, so
    # the result is a true mean (dividing by count + 1 would shrink every vector).
    ctr = 0
    for w in sent:
        if w in vectors:
            wv_res += vectors[w]
            ctr += 1
    if ctr:  # avoid dividing by zero when no token is in the vocabulary
        wv_res /= ctr
    return wv_res

In [None]:

# Embed every tokenised article with `word_vec` and keep the result
# (one value per row) in a new column.
data['word_vectors2'] = data['cleaned_text'].apply(word_vec)


In [None]:
data.head()

## Classifying the data

In [None]:
# importing necessary libraries for model building
from sklearn.model_selection import train_test_split

In [None]:
# Feature sets: X  -> averaged vectors from the Word2Vec model built in this notebook
#               X2 -> the `word_vectors2` column
X, X2 = word_vectors1, data['word_vectors2'].to_numpy()
# Labels shared by both feature sets
y = data['target'].to_numpy()

In [None]:
# Split each feature set into training and test parts, holding out 33% of the
# samples; both splits use the same options, including the random seed.
split_options = {'test_size': 0.33, 'random_state': 44}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y, **split_options)

In [None]:
# Turn each collection of per-document vectors into a single array by stacking
# the vectors along a new first axis.
X_train_2d, X_test_2d, X_train2_2d, X_test2_2d = (
    np.stack(part) for part in (X_train, X_test, X_train2, X_test2)
)
X_train_2d.shape, X_test_2d.shape

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Gradient-boosting classifier with default hyper-parameters, fitted on the
# stacked training vectors of the first feature set
clf = GradientBoostingClassifier().fit(X_train_2d, y_train)

# Predict the labels of the held-out vectors
y_pred = clf.predict(X_test_2d)

# Print the classification report comparing the true and predicted labels
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Second gradient-boosting classifier, fitted on the second feature set
clf2 = GradientBoostingClassifier()
clf2.fit(X_train2_2d, y_train2)
# Predict with clf2 itself: the first classifier (`clf`) was fitted on a
# different feature set and must not be used to score this test set.
y_pred2 = clf2.predict(X_test2_2d)
print(classification_report(y_test2, y_pred2))