<a href="https://colab.research.google.com/github/Bobbyamity/SMTA_Topic_Modelling/blob/main/Social_Media_Topic_Modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Social Media and Text Analytics
Industry Assignment 1: Topic Modelling

### First we import all important libraries

In [1]:
import re
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder

### Loading and inspecting the dataset

In [2]:
train= pd.read_csv("train.csv")
test= pd.read_csv("test.csv")
train.head(2)

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0


In [3]:
print(train.shape)
print(test.shape)

(20972, 9)
(8989, 3)


### Data Preprocessing

In [4]:
col = ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']
test = test.drop(['ID'],axis=1)

X = train.loc[:,['TITLE','ABSTRACT']]
y = train.loc[:,col]

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, shuffle=True)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(18874, 2) (2098, 2)
(18874, 6) (2098, 6)


In [6]:
y_test.reset_index(drop=True,inplace=True)
X_test.reset_index(drop=True,inplace=True)

In [7]:
y1 = np.array(y_train)
y2 = np.array(y_test)

### Removing Punctuations

In [8]:
X_train.replace('[^a-zA-Z]',' ', regex=True, inplace=True)
X_test.replace('[^a-zA-Z]',' ', regex=True, inplace=True)

test.replace('[^a-zA-Z]',' ', regex=True, inplace=True)

### Converting to lower case characters

In [9]:
for index in X_train.columns:
  X_train[index] = X_train[index].str.lower()

for index in X_test.columns:
  X_test[index] = X_test[index].str.lower()

for index in test.columns:
  test[index] = test[index].str.lower()

### Removing multiple blank spaces

In [10]:
X_train = X_train.replace(r's+', ' ', regex=True)
X_test = X_test.replace(r's+', ' ', regex=True)

test = test.replace(r's+', ' ', regex=True)

### Feature Engineering

In [11]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back to form a preprocessed text
    processed_text = ' '.join(tokens)

    return processed_text

def convert_to_lines(data):
    lines = []
    for row in range(data.shape[0]):
        lines.append(' '.join(str(x) for x in data.iloc[row, :]))
    return lines
stop_words = set(stopwords.words('english'))
X_train['combined'] = X_train['TITLE']+' '+X_train['ABSTRACT']
X_test['combined'] = X_test['TITLE']+' '+X_test['ABSTRACT']

test['combined'] = test['TITLE']+' '+test['ABSTRACT']

X_train = X_train.drop(['TITLE','ABSTRACT'],axis=1)
X_test = X_test.drop(['TITLE','ABSTRACT'],axis=1)

test = test.drop(['TITLE','ABSTRACT'],axis=1)

X_train.head()
X_lines = []
for row in range(0,X.shape[0]):
  X_lines.append(' '.join(str(x) for x in X.iloc[row,:]))

train_lines = []
for row in range(0,X_train.shape[0]):
  train_lines.append(' '.join(str(x) for x in X_train.iloc[row,:]))

test_lines = []
for row in range(0,X_test.shape[0]):
  test_lines.append(' '.join(str(x) for x in X_test.iloc[row,:]))

predtest_lines = []
for row in range(0,test.shape[0]):
  predtest_lines.append(' '.join(str(x) for x in test.iloc[row,:]))
from sklearn.feature_extraction.text import CountVectorizer

countvector = CountVectorizer(ngram_range=(1,2))
X_train_cv = countvector.fit_transform(train_lines)
X_test_cv = countvector.transform(test_lines)

test_cv = countvector.transform(predtest_lines)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


### Using CountVectorizer and TfidfVectorize & Model Development


In [12]:
#Using TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

tfidfvector = TfidfTransformer()
X_train_tf = tfidfvector.fit_transform(X_train_cv)
X_test_tf = tfidfvector.fit_transform(X_test_cv)

test_tf = tfidfvector.fit_transform(test_cv)

X_cv = countvector.transform(X_lines)

X_tf = tfidfvector.fit_transform(X_cv) #x_tf,y

### Model Evaluation

In [13]:
from sklearn.svm import LinearSVC
from sklearn.multioutput import MultiOutputClassifier

model = LinearSVC(C=0.5, class_weight='balanced', random_state=42)
models = MultiOutputClassifier(model)

models.fit(X_train_tf, y1)
preds = models.predict(X_test_tf)
preds
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

### Evaluating the model performance

In [14]:
#print(confusion_matrix(y2,preds))
print(classification_report(y2,preds))
print(accuracy_score(y2,preds))
predssv = models.predict(test_tf)
predssv
test = pd.read_csv('test.csv')

submit = pd.DataFrame({'ID': test.ID, 'Computer Science': predssv[:,0],'Physics':predssv[:,1],
                       'Mathematics':predssv[:,2],'Statistics':predssv[:,3],'Quantitative Biology':predssv[:,4],
                       'Quantitative Finance':predssv[:,5]})

              precision    recall  f1-score   support

           0       0.80      0.90      0.85       853
           1       0.89      0.88      0.89       623
           2       0.83      0.83      0.83       580
           3       0.73      0.85      0.78       516
           4       0.49      0.40      0.44        58
           5       0.81      0.65      0.72        26

   micro avg       0.80      0.86      0.83      2656
   macro avg       0.76      0.75      0.75      2656
weighted avg       0.81      0.86      0.83      2656
 samples avg       0.84      0.88      0.84      2656

0.6601525262154433


### Submission and Model Persistence

In [15]:
submit.head()

Unnamed: 0,ID,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,20973,0,0,0,1,0,0
1,20974,0,1,0,0,0,0
2,20975,1,0,0,0,0,0
3,20976,0,1,0,0,0,0
4,20977,1,0,0,0,0,0


In [16]:
submit.to_csv('Bobby_Submission.csv', index=False)

### Creating submission file and saving trained models using pickle

In [17]:
import pickle
# Save the trained MultiOutputClassifier model to a file
with open('multi_output_classifier_model.pkl', 'wb') as file:
    pickle.dump(models, file)

In [18]:
# Loading the MultiOutputClassifier model
with open('multi_output_classifier_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [19]:
# Save the CountVectorizer
with open('countvectorizer.pkl', 'wb') as file:
    pickle.dump(countvector, file)

In [20]:
# Save the CountVectorizer
with open('countvectorizer.pkl', 'wb') as file:
    pickle.dump(countvector, file)

In [21]:
# Loading the CountVectorizer
with open('countvectorizer.pkl', 'rb') as file:
    loaded_countvectorizer = pickle.load(file)

In [22]:
# Save the TfidfTransformer
with open('tfidftransformer.pkl', 'wb') as file:
    pickle.dump(tfidfvector, file)

In [23]:
# Loading the TfidfTransformer
with open('tfidftransformer.pkl', 'rb') as file:
    loaded_tfidftransformer = pickle.load(file)