In [None]:
# Import needed libraries
import pandas as pd
import numpy as np
from pandas import option_context
from sklearn.model_selection import train_test_split
#import Feature Extraction Methods
from sklearn.feature_extraction.text import CountVectorizer
#import classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
# import Evaluation Metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, f1_score,precision_score
import math
from sklearn.base import BaseEstimator, TransformerMixin
# Render floats in displayed frames with thousands separators and two decimals
pd.set_option("display.float_format", "{:,.2f}".format)

# Load the train and test sets; the columns of both files are relabelled
# to ('text', 'target') regardless of the header names stored in the CSVs
train = pd.read_csv('train_set.csv').set_axis(['text', 'target'], axis=1)

test = pd.read_csv('test_set.csv').set_axis(['text', 'target'], axis=1)

In [None]:
# Step 0: separate the text column (x) from the target column (y)
# for the training set and for the test set
train_x, train_y = train['text'], train['target']

test_x, test_y = test['text'], test['target']

In [None]:
# Step 1: local factor 1 (TF) -- raw term counts per document
TF = CountVectorizer()

# Learn the vocabulary on the training text and count its terms;
# the sparse counts are expanded into a dense frame with one column per term
count_TF = TF.fit_transform(train_x)
featuresTF = TF.get_feature_names_out()
train_TF = pd.DataFrame(count_TF.toarray(), columns=featuresTF)

# Count the already-fitted vocabulary on the test text (no re-fitting),
# so the test frame has exactly the same columns as the training frame
count_TFt = TF.transform(test_x)
featuresTFt = TF.get_feature_names_out()
test_TF = pd.DataFrame(count_TFt.toarray(), columns=featuresTFt)

# TF-TDA Transformer

In [None]:
# Step 2: TF-TDA
class TF_TDA_t(BaseEstimator, TransformerMixin):
    """Re-weight a term-count frame with TF-TDA class-discriminative factors.

    ``fit`` uses the class-wise document frequency of every term (label 1 is
    the positive class, label 0 the negative class) to split the vocabulary
    into:

    * pure terms (T+, T-): present in documents of one class only;
    * common discriminative terms (Freq+, Freq-): present in both classes,
      with a gap of at least ``K`` percentage points between the two class
      frequencies;
    * general terms (G): present in both classes with a smaller gap.

    ``transform`` multiplies the column of every pure or discriminative term
    by::

        log2(2 + (f_own / max(1, f_other)) * (x * share_own))

    where ``f_own`` / ``f_other`` are the percentages of documents of the
    favoured / other class that contain the term (``f_other`` is 0 for pure
    terms), ``x`` is 2 for pure terms and 1 for common terms, and
    ``share_own`` is the fraction of training samples that belong to the
    favoured class, rounded to two decimals. General terms and columns that
    were not seen during ``fit`` are left untouched.

    Parameters
    ----------
    K : number
        Minimum absolute difference, in percentage points, between the
        positive and negative class frequencies of a common term for the term
        to count as discriminative.

    Notes
    -----
    The parameter is stored under its own name (``self.K``) because
    ``BaseEstimator.get_params`` reads constructor parameters by name; the
    old ``_K`` spelling is kept as an alias.
    """

    def __init__(self,K):
        self.K = K

    @property
    def _K(self):
        # Backward-compatible alias for code that still reads/writes ``_K``.
        return self.K

    @_K.setter
    def _K(self, value):
        self.K = value

    @staticmethod
    def _rank_terms(counts, cls):
        """Return the document-frequency ranking of the terms of one class.

        Parameters
        ----------
        counts : pandas.DataFrame
            Term-count rows of the documents of a single class.
        cls : str
            Suffix of the frequency column ('p' or 'n').

        Returns
        -------
        pandas.DataFrame
            Columns ``Term``, ``DF`` (number of documents with a count > 0)
            and ``cls_freq_<cls>`` (``DF`` as a percentage of the class
            documents); terms with ``DF == 0`` are dropped and the rows are
            sorted by descending frequency.
        """
        freq_col = 'cls_freq_' + cls
        n_docs = len(counts)
        # Presence/absence per document, summed per term -> document frequency.
        doc_freq = (counts > 0).sum(axis=0)
        if n_docs:
            cls_freq = doc_freq / n_docs * 100
        else:
            # Empty class: every DF is 0 and is filtered out below; this only
            # avoids a 0/0 division.
            cls_freq = doc_freq.astype(float)
        ranking = pd.DataFrame({
            'Term': doc_freq.index,
            'DF': doc_freq.to_numpy(),
            freq_col: cls_freq.to_numpy(),
        })
        ranking = ranking[ranking['DF'] > 0]
        return ranking.sort_values(freq_col, ascending=False, kind='stable')

    @staticmethod
    def _relevance_factor(cls_freq_a, cls_freq_c, x, cls_per):
        """Return ``log2(2 + (cls_freq_a / max(1, cls_freq_c)) * (x * cls_per))``."""
        denominator = max(1, cls_freq_c)
        return math.log2(2 + ((cls_freq_a / denominator) * (x * cls_per)))

    def fit(self, X, y):
        """Learn the term groups and the per-term weights from labelled counts.

        Parameters
        ----------
        X : pandas.DataFrame
            Term counts, one row per document and one column per term.
            The frame is not modified.
        y : array-like of length ``len(X)``
            Class labels, matched to the rows of ``X`` by position. Rows
            labelled 1 form the positive class and rows labelled 0 the
            negative class; rows with any other label only count towards the
            total number of samples.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is empty or ``y`` does not have one label per row.
        """
        K = self.K

        # Labels are matched positionally so that a ``y`` carrying a different
        # index than ``X`` cannot be silently misaligned.
        labels = np.ravel(np.asarray(y))
        N = len(X)
        if N == 0:
            raise ValueError('TF_TDA_t.fit needs at least one training sample.')
        if len(labels) != N:
            raise ValueError(
                'X and y have inconsistent lengths: {} rows vs {} labels.'.format(N, len(labels))
            )

        # Positive and negative documents (no helper column is added to X, so
        # a vocabulary term called 'label' is safe).
        Positive_Training = X.loc[labels == 1]
        Negative_Training = X.loc[labels == 0]

        # a / c: class-wise document frequencies of the terms
        a = self._rank_terms(Positive_Training, 'p')
        c = self._rank_terms(Negative_Training, 'n')
        self._a = a
        self._c = c

        # Share of positive and negative samples in the training set
        PosPer = round((len(Positive_Training)/N),2)
        self._PosPer = PosPer
        NegPer = round((len(Negative_Training)/N),2)
        self._NegPer = NegPer

        # ------------------ Find T+, T-, and Com ------------------ #
        # Terms that occur in both classes (inner join of a and c)
        commonTermsTraining = a.merge(c, on=['Term'])
        commonTermsTraining.columns=['Term','DFP','cls_freq_p','DFN','cls_freq_n']
        AllCommon = list(commonTermsTraining['Term'])
        self._AllCommon = AllCommon

        # Pure terms (T+, T-): present in exactly one class
        common_lookup = set(AllCommon)
        PurePos = [term for term in a['Term'] if term not in common_lookup]
        PureNeg = [term for term in c['Term'] if term not in common_lookup]
        self._PureNeg = PureNeg
        self._PurePos = PurePos

        # ------------------ Find Freq+, Freq-, and G ------------------ #
        # Signed gap between the positive and the negative class frequency
        commonTermsTraining['Variance'] = commonTermsTraining['cls_freq_p']-commonTermsTraining['cls_freq_n']

        # Common negative terms (Freq-)
        CommonNegative = list(
            commonTermsTraining.loc[commonTermsTraining['Variance'] <= (-1*K), 'Term']
        )
        self._CommonNegative=CommonNegative

        # Common positive terms (Freq+)
        CommonPositive = list(
            commonTermsTraining.loc[commonTermsTraining['Variance'] >= K, 'Term']
        )
        self._CommonPositive=CommonPositive

        # Useful common terms
        UsefulCommon=CommonPositive+CommonNegative
        self._UsefulCommon=UsefulCommon

        # General terms: common terms whose gap is below the threshold
        useful_lookup = set(UsefulCommon)
        G = [term for term in AllCommon if term not in useful_lookup]
        self._G = G

        # ------------------ Per-term weights ------------------ #
        freq_p = dict(zip(a['Term'], a['cls_freq_p']))
        freq_n = dict(zip(c['Term'], c['cls_freq_n']))

        weights = {}
        for term in CommonPositive:
            weights[term] = self._relevance_factor(freq_p[term], freq_n[term], 1, PosPer)
        for term in CommonNegative:
            # A term can sit in both lists only when K <= 0; the positive
            # weight then wins and is applied exactly once.
            if term not in weights:
                weights[term] = self._relevance_factor(freq_n[term], freq_p[term], 1, NegPer)
        for term in PurePos:
            weights[term] = self._relevance_factor(freq_p[term], 0, 2, PosPer)
        for term in PureNeg:
            weights[term] = self._relevance_factor(freq_n[term], 0, 2, NegPer)
        self._weights = weights

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted term weights applied.

        Parameters
        ----------
        X : pandas.DataFrame
            Term counts with (at least) one column per weighted term learned
            by ``fit``. The frame is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` in which the column of every pure or common
            discriminative term is multiplied by its weight; all other
            columns are unchanged.

        Raises
        ------
        AttributeError
            If the transformer has not been fitted.
        KeyError
            If ``X`` lacks the column of a weighted term.
        """
        weights = getattr(self, '_weights', None)
        if weights is None:
            raise AttributeError('TF_TDA_t is not fitted yet; call fit() before transform().')

        unknown = [term for term in weights if term not in X.columns]
        if unknown:
            raise KeyError(
                'X is missing {} fitted term column(s), e.g. {}'.format(len(unknown), unknown[:5])
            )

        weighted = X.copy()
        if weights:
            terms = list(weights)
            factors = pd.Series(weights, dtype=float)
            # Column-wise scaling of all weighted terms in one operation.
            weighted[terms] = X[terms].mul(factors, axis=1)
        return weighted

# Modeling and Evaluation Results


In [None]:
# Classification Models: multinomial Naive Bayes and a linear-kernel SVM
SVM = SVC(kernel='linear')
NB = MultinomialNB()

# Empty frame that Classification_Process extends with one row of scores
results = pd.DataFrame()

def Classification_Process(clf,method,t):
    """Fit a transformer and a classifier on the train set and score the test set.

    Reads the module-level ``train_TF`` / ``test_TF`` count frames, the
    ``train_y`` / ``test_y`` labels and the ``results`` frame. The count
    frames are copied before they are handed to the transformer, so the
    module-level frames are never altered.

    Parameters
    ----------
    clf : estimator
        Classifier exposing ``fit`` and ``predict``; it is fitted in place.
    method : str
        Name of the weighting method, stored in the 'Method' column.
    t : transformer
        Object exposing ``fit_transform(X, y)`` and ``transform(X)``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``results`` followed by one new row holding the
        classifier, the method name and the weighted precision, recall,
        F-score and the accuracy, formatted as percentage strings.
        ``results`` itself is not modified.
    """
    test_x_Copy = test_TF.copy()
    train_x_Copy = train_TF.copy()

    # Fit the weighting on the training data only, then apply it to both sets.
    train_weighted = t.fit_transform(train_x_Copy,train_y)
    test_weighted = t.transform(test_x_Copy)

    clf.fit(train_weighted,train_y)
    predM = clf.predict(test_weighted)

    row = {
        'clf': clf,
        'Method': method,
        'precision': '{:.2%}'.format(precision_score(test_y, predM,average='weighted')),
        'recall': '{:.2%}'.format(recall_score(test_y, predM,average='weighted')),
        'F_Score': '{:.2%}'.format(f1_score(test_y, predM,average='weighted')),
        'Accuracy': '{:.2%}'.format(accuracy_score(test_y, predM)),
    }
    # DataFrame.append was removed in pandas 2.0; concat is the supported way
    # to add a row and works on older pandas versions as well.
    return pd.concat([results, pd.DataFrame([row])], ignore_index=True)

# Final Results of NB & SVM


In [None]:
# NOTE: K has to be tuned -- the best value may differ depending on the dataset
# and on the classification model. Replace the placeholder below with the
# tuned value before running this cell.
optimal_k = None
if optimal_k is None:
    raise ValueError('optimal_k is not set: assign the K value tuned for this dataset and classifier.')

TF_TDA_NB = Classification_Process(NB,'TF_TDA',TF_TDA_t(optimal_k))
TF_TDA_SVM = Classification_Process(SVM,'TF_TDA',TF_TDA_t(optimal_k))

# One row per classifier; ignore_index gives the rows distinct labels (0, 1)
frames = [TF_TDA_NB,TF_TDA_SVM]
Final_result = pd.concat(frames, ignore_index=True)
display(Final_result)