In [None]:
# Mount Google Drive at /content/gdrive so the Drive files are reachable from this runtime.
# NOTE(review): later cells use the relative path 'gdrive/MyDrive/...', which presumably
# relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [None]:

# Single module-level WordNet lemmatizer instance, reused by normalize_word().
_wnl = nltk.WordNetLemmatizer()


def normalize_word(w):
    """Return the lower-cased WordNet lemma of a single token.

    The token is lemmatized first (with the lemmatizer's default settings)
    and the result is lower-cased afterwards.
    """
    lemma = _wnl.lemmatize(w)
    return lemma.lower()


def get_tokenized_lemmas(s):
    """Tokenize ``s`` with NLTK and return the normalized form of each token.

    Returns a list with one entry per token produced by ``nltk.word_tokenize``,
    each passed through ``normalize_word``.
    """
    tokens = nltk.word_tokenize(s)
    return list(map(normalize_word, tokens))


def clean(s):
    """Reduce a string to its lower-cased word runs separated by single spaces.

    Every maximal run of ``\\w`` characters (letters, digits and underscore)
    is kept; everything in between is collapsed to one space, which also
    trims leading and trailing separators.
    """
    words = re.findall(r'\w+', s, flags=re.UNICODE)
    joined = " ".join(words)
    return joined.lower()


def remove_stopwords(l):
    """Return the tokens of ``l`` that are not scikit-learn English stop words.

    Order and duplicates of the remaining tokens are preserved; the input
    list is not modified.
    """
    kept = []
    for token in l:
        if token in feature_extraction.text.ENGLISH_STOP_WORDS:
            continue
        kept.append(token)
    return kept

def preprocess(headlines,bodies):
  """Clean, lemmatize and stop-word-filter paired headlines and bodies.

  Each text goes through clean(), get_tokenized_lemmas() and
  remove_stopwords(); the surviving tokens are re-joined with single spaces
  so the result is again one string per document.

  Args:
    headlines: iterable of headline strings.
    bodies: iterable of body strings, paired positionally with ``headlines``
      (pairs are formed with zip, so extra items in the longer input are
      ignored).

  Returns:
    A tuple ``(headline_series, body_series)`` of pandas Series named
    'Headline' and 'Body', holding the preprocessed strings in input order.
  """
  # tqdm can only draw a proper progress bar when it knows the total; fall
  # back to a plain counter for inputs without a length (e.g. generators).
  try:
    total = min(len(headlines), len(bodies))
  except TypeError:
    total = None

  n_headlines, n_bodies = [], []
  for headline, body in tqdm(zip(headlines, bodies), total=total):
    headline_tokens = remove_stopwords(get_tokenized_lemmas(clean(headline)))
    body_tokens = remove_stopwords(get_tokenized_lemmas(clean(body)))
    # Store the processed text, not the raw input: previously the cleaned
    # tokens were computed and then thrown away.
    n_headlines.append(" ".join(headline_tokens))
    n_bodies.append(" ".join(body_tokens))

  n_headlines_df = pd.DataFrame(n_headlines, columns=['Headline'])
  n_bodies_df = pd.DataFrame(n_bodies, columns=['Body'])
  return n_headlines_df['Headline'], n_bodies_df['Body']


In [None]:
def statistical_features(dataset_loc,
                         train_loc='gdrive/MyDrive/CS626/Project/Data/train_Set.csv',
                         max_total_features=10000):
  """Build a dense TF-IDF feature matrix for the headlines and bodies of a CSV.

  Two TfidfVectorizers are fitted on the training CSV (one on its 'Headline'
  column, one on its 'Body' column, both after preprocess()) and then applied
  to the CSV at ``dataset_loc``. The headline vectorizer keeps its full
  vocabulary; the body vectorizer is capped so that the combined number of
  columns does not exceed ``max_total_features``.

  Args:
    dataset_loc: path of the CSV to featurize; must have 'Headline' and
      'Body' columns.
    train_loc: path of the training CSV used to fit both vectorizers.
    max_total_features: upper bound on headline + body feature columns.

  Returns:
    A 2-D numpy array with one row per document: headline TF-IDF columns
    followed by body TF-IDF columns.

  Raises:
    ValueError: if the headline vocabulary alone already uses up
      ``max_total_features``, leaving no room for body features.
  """
  train_df = pd.read_csv(train_loc)
  train_headlines, train_bodies = preprocess(train_df['Headline'], train_df['Body'])

  if dataset_loc == train_loc:
    # Featurizing the training set itself: reuse the texts that were just
    # preprocessed instead of reading and preprocessing the same file twice.
    headlines, bodies = train_headlines, train_bodies
  else:
    dataset = pd.read_csv(dataset_loc)
    headlines, bodies = preprocess(dataset['Headline'], dataset['Body'])

  headline_vectorizer = TfidfVectorizer().fit(train_headlines)
  headline_matrix = headline_vectorizer.transform(headlines)

  # Whatever the headline vocabulary leaves of the budget goes to the bodies.
  body_budget = max_total_features - headline_matrix.shape[1]
  if body_budget <= 0:
    raise ValueError(
        "headline vocabulary (%d terms) leaves no room for body features "
        "within max_total_features=%d"
        % (headline_matrix.shape[1], max_total_features))

  body_vectorizer = TfidfVectorizer(max_features=body_budget).fit(train_bodies)
  body_matrix = body_vectorizer.transform(bodies)

  return np.concatenate((headline_matrix.toarray(), body_matrix.toarray()), axis=1)

In [None]:
# Build the TF-IDF feature matrix for the training CSV.
statistical_features_train = statistical_features('gdrive/MyDrive/CS626/Project/Data/train_Set.csv')

In [None]:
# Inspect how many entries of row 500 of the training feature matrix are non-zero.
np.count_nonzero(statistical_features_train[500])

In [None]:
# Build the TF-IDF feature matrix for the test CSV.
statistical_features_test = statistical_features('gdrive/MyDrive/CS626/Project/Data/test_Set.csv')

In [None]:
# Display the shape of the test feature matrix.
statistical_features_test.shape

In [None]:
# Write the test feature matrix to Drive in NumPy .npy format.
np.save(arr=statistical_features_test,file='gdrive/MyDrive/CS626/Project/Data/test_statistical_features.npy')

In [None]:
# Write the training feature matrix to Drive in NumPy .npy format.
np.save(arr=statistical_features_train,file='gdrive/MyDrive/CS626/Project/Data/train_statistical_features.npy')

In [None]:
# Inspect how many entries of the first row of the test feature matrix are non-zero.
np.count_nonzero(statistical_features_test[0])