In [1]:
#Imports
import re
import os
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd 
import sklearn
import tensorflow
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import layers, utils, callbacks, optimizers, regularizers
from tensorflow.keras.layers import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix


In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Mount and access drive
drive.mount('/content/drive',force_remount=True)
os.chdir('/content/drive/My Drive/Few-Shot-Tamil')
print("Change successful.")

Mounted at /content/drive
Change successful.


In [4]:
def tokenize(data):
  generated_token = list(data.split())
  return generated_token

In [5]:
ps = nltk.PorterStemmer()
def stemming(list_of_words):
  stemmed_list = [ps.stem(word) for word in list_of_words]
  return stemmed_list

In [6]:
#Apply TF-IDF
def tfidf(data):
    tfidf_vectorizer = TfidfVectorizer()
    train = tfidf_vectorizer.fit_transform(data)
    return train, tfidf_vectorizer

In [7]:
def tamil_preprocessing(dataframe):
  #Remove punctuation symbols
  dataframe['non_punc'] = dataframe['News'].apply(lambda x: re.sub(r'[\'\"\.?!‘]+', '', x.lower()))

  #Tokenize the sentences
  dataframe['tamil_tokenized_text'] = dataframe['non_punc'].apply(lambda x: tokenize(x))

In [8]:
#English translation preprocessing
def eng_preprocessing(dataframe):
  #Remove non-alphanumeric symbols
  dataframe['only_alphanumeric'] = dataframe['English version'].apply(lambda x: re.sub(r'[^a-zA-Z ]+', '', x.lower()))

  #Remove stop-words
  dataframe['imp_text'] = dataframe['only_alphanumeric'].apply(lambda x : ' '.join([word for word in x.split() if not word in set(stopwords.words('english'))]))

  #Tokenize the sentences
  dataframe['tokenized_text'] = dataframe['imp_text'].apply(lambda x: tokenize(x))

  #Apply stemming
  dataframe['stemmed_text'] = dataframe['tokenized_text'].apply(lambda x: stemming(x))

In [9]:
#Load the datasets
one_shot_real = pd.read_csv('one-shot-real.csv',  sep='\t')
one_shot_fake = pd.read_csv('one-shot-fake.csv',  sep='\t')
three_shot_real = pd.read_csv('three-shot-real.csv',  sep='\t')
three_shot_fake = pd.read_csv('three-shot-fake.csv',  sep='\t')
five_shot_real = pd.read_csv('five-shot-real.csv',  sep='\t')
five_shot_fake = pd.read_csv('five-shot-fake.csv',  sep='\t')

In [10]:
#Combine the datasets
one_shot = pd.concat([one_shot_real, one_shot_fake])
three_shot = pd.concat([three_shot_real, three_shot_fake])
five_shot = pd.concat([five_shot_real, five_shot_fake])

In [11]:
one_shot['Authenticity'] = [0, 0, 0, 0, 0, 1, 1, 1 ,1, 1]
three_shot['Authenticity'] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1 ,1, 1, 1, 1, 1 ,1, 1, 1, 1, 1 ,1, 1]
five_shot['Authenticity'] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1 ,1, 1, 1, 1, 1 ,1, 1, 1, 1, 1 ,1, 1, 1, 1, 1 ,1, 1, 1, 1, 1 ,1, 1]

In [12]:
#Take 10% of the original dataset for testing
os.chdir('/content/drive/My Drive')
news_df = pd.read_csv(filepath_or_buffer='news.csv')
X = news_df['News']
y = news_df['Authenticity']
xtrain, xtest, ytrain, ytest = train_test_split(X, y, stratify=y, random_state=42, test_size=0.1,)

In [13]:
xtest.reset_index(inplace = True, drop = True)

### One-Shot dataset

In [14]:
tamil_preprocessing(one_shot)
eng_preprocessing(one_shot)
one_shot.head()

Unnamed: 0,English version,Label,News,Author,Date,URL,Authenticity,non_punc,tamil_tokenized_text,only_alphanumeric,imp_text,tokenized_text,stemmed_text
0,Actor Jayam Ravi has Corona,['entertainment'],நடிகர் ஜெயம் ரவிக்கு,தினத்தந்தி,22-10-2022 02:53,https://www.dailythanthi.com/Cinema/CinemaNews...,0,நடிகர் ஜெயம் ரவிக்கு,"[நடிகர், ஜெயம், ரவிக்கு]",actor jayam ravi has corona,actor jayam ravi corona,"[actor, jayam, ravi, corona]","[actor, jayam, ravi, corona]"
1,A football god about the 'hands of God',['miscellaneous'],‘கடவுளின் கை’களைப் பற்றிய கால்பந்துக் கடவுள்,ஆதி,01-Dec-20,https://www.hindutamil.in/news/supplements/ila...,0,கடவுளின் கை’களைப் பற்றிய கால்பந்துக் கடவுள்,"[கடவுளின், கை’களைப், பற்றிய, கால்பந்துக், கடவுள்]",a football god about the hands of god,football god hands god,"[football, god, hands, god]","[footbal, god, hand, god]"
2,Cameras mandatory in school buses: Tamil Nadu ...,['politics'],பள்ளி பேருந்துகளில் கேமரா கட்டாயம்: தமிழக அரச...,செய்திப்பிரிவு,21-Oct-22,https://www.hindutamil.in/news/tamilnadu/88576...,0,பள்ளி பேருந்துகளில் கேமரா கட்டாயம்: தமிழக அரச...,"[பள்ளி, பேருந்துகளில், கேமரா, கட்டாயம்:, தமிழக...",cameras mandatory in school buses tamil nadu g...,cameras mandatory school buses tamil nadu gove...,"[cameras, mandatory, school, buses, tamil, nad...","[camera, mandatori, school, buse, tamil, nadu,..."
3,36th National Games | 4 gold medals for Tamil ...,['sport'],36-வது தேசிய விளையாட்டு | ஒரே நாளில் தமிழகத்த...,செய்திப்பிரிவு,02-Oct-22,https://www.hindutamil.in/news/sports/877427-3...,0,36-வது தேசிய விளையாட்டு | ஒரே நாளில் தமிழகத்த...,"[36-வது, தேசிய, விளையாட்டு, |, ஒரே, நாளில், தம...",th national games gold medals for tamil nadu...,th national games gold medals tamil nadu one day,"[th, national, games, gold, medals, tamil, nad...","[th, nation, game, gold, medal, tamil, nadu, o..."
4,Introducing NoiseFit Core 2 Smartwatch: Amazin...,['tech'],நாய்ஸ்ஃபிட் கோர் 2 ஸ்மார்ட்வாட்ச் அறிமுகம்: 5...,செய்திப்பிரிவு,22-Aug-22,https://www.hindutamil.in/news/technology/8467...,0,நாய்ஸ்ஃபிட் கோர் 2 ஸ்மார்ட்வாட்ச் அறிமுகம்: 5...,"[நாய்ஸ்ஃபிட், கோர், 2, ஸ்மார்ட்வாட்ச், அறிமுகம...",introducing noisefit core smartwatch amazing ...,introducing noisefit core smartwatch amazing f...,"[introducing, noisefit, core, smartwatch, amaz...","[introduc, noisefit, core, smartwatch, amaz, f..."


In [15]:
one_shot.reset_index(inplace = True, drop = True)

In [16]:
one_shot['stemmed_text']

0                         [actor, jayam, ravi, corona]
1                            [footbal, god, hand, god]
2    [camera, mandatori, school, buse, tamil, nadu,...
3    [th, nation, game, gold, medal, tamil, nadu, o...
4    [introduc, noisefit, core, smartwatch, amaz, f...
5                  [say, daniel, radcliff, coronaviru]
6                                   [say, covid, viru]
7    [say, presid, donald, trump, said, peopl, die,...
8    [kieron, pollard, good, viral, kieron, pollard...
9    [pib, fact, check, link, g, radiat, second, wa...
Name: stemmed_text, dtype: object

In [17]:
one_shot['tamil_tokenized_text']

0                             [நடிகர், ஜெயம், ரவிக்கு]
1    [கடவுளின், கை’களைப், பற்றிய, கால்பந்துக், கடவுள்]
2    [பள்ளி, பேருந்துகளில், கேமரா, கட்டாயம்:, தமிழக...
3    [36-வது, தேசிய, விளையாட்டு, |, ஒரே, நாளில், தம...
4    [நாய்ஸ்ஃபிட், கோர், 2, ஸ்மார்ட்வாட்ச், அறிமுகம...
5    [டேனியல், ராட்கிளிஃப், கொரோனா, வைரஸால், பாதிக்...
6                  [covid-19, வைரஸ், இல்லை, என்கிறார்]
7    [அமெரிக்க, அதிபர், டொனால்ட், டிரம்ப்,, “இதுவரை...
8    [kieron, pollard, நல்லாதான், இருக்காரு:, viral...
9    [pib, fact, check:, 5, ஜி, கதிர்வீச்சுக்கும், ...
Name: tamil_tokenized_text, dtype: object

In [18]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models
from torch import nn
from torch.utils.data import DataLoader

In [20]:
#Load a pre-trained model
word_embedding_model = models.Transformer('roberta-base', max_seq_length=128)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
dense_model_1 = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=128, activation_function=nn.Tanh())
dense_model_2 = models.Dense(in_features=128, out_features=128, activation_function=nn.Tanh())
dense_model_3 = models.Dense(in_features=128, out_features=64, activation_function=nn.Tanh())
dense_model_4 = models.Dense(in_features=64, out_features=32, activation_function=nn.Tanh())

one_shot_model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model_1, dense_model_2, dense_model_3, dense_model_4])


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
#Sentences to train
english_version = one_shot['stemmed_text'].apply(lambda x: ' '.join(x))
tamil_version = one_shot['tamil_tokenized_text'].apply(lambda x: ' '.join(x))

In [22]:
train_examples_one_shot = [
    InputExample(texts=[english_version[0] , tamil_version[0]], label=1.0),
    InputExample(texts=[english_version[1] , tamil_version[1]], label=1.0),
    InputExample(texts=[english_version[2] , tamil_version[2]], label=1.0),
    InputExample(texts=[english_version[3] , tamil_version[3]], label=1.0),
    InputExample(texts=[english_version[4] , tamil_version[4]], label=1.0),
    InputExample(texts=[english_version[5] , tamil_version[5]], label=1.0),    
    InputExample(texts=[english_version[6] , tamil_version[6]], label=1.0),
    InputExample(texts=[english_version[7] , tamil_version[7]], label=1.0),
    InputExample(texts=[english_version[8] , tamil_version[8]], label=1.0),
    InputExample(texts=[english_version[9] , tamil_version[9]], label=1.0),
]


In [23]:
train_dataloader = DataLoader(train_examples_one_shot, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(one_shot_model)

In [24]:
one_shot_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

In [25]:
one_shot_x = [  one_shot_model.encode(tamil_version[i]) for i in range(len(tamil_version)) ]

In [26]:
x_test = [ one_shot_model.encode(xtest[i]) for i in range(len(xtest))]

In [27]:
#Run the machine learning models for one-shot
clf = LogisticRegression(C=1.0)
clf.fit(one_shot_x, one_shot['Authenticity'])

LogisticRegression()

In [28]:
predictions = clf.predict_proba(x_test)
predictions = [np.argmax(predictions[i]) for i in range(len(predictions))]

In [29]:
accuracy = sklearn.metrics.accuracy_score(predictions,ytest)
accuracy

0.5793499043977055

In [30]:
#Support Vector Machine
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(one_shot_x, one_shot['Authenticity'])
predictions = clf.predict_proba(x_test)
predictions = [np.argmax(predictions[i]) for i in range(len(predictions))]

accuracy = sklearn.metrics.accuracy_score(predictions,ytest)
accuracy

0.44550669216061184

In [31]:
from sklearn import svm
SVM = svm.NuSVC(max_iter=100, probability = True)
SVM.fit(one_shot_x, one_shot['Authenticity'])
predictions = SVM.predict(x_test)
predictions = [np.argmax(predictions[i]) for i in range(len(predictions))]

accuracy = sklearn.metrics.accuracy_score(predictions,ytest)
accuracy

0.44550669216061184

In [32]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_x = scaler.fit_transform(one_shot_x)
scaled_test = scaler.transform(x_test)

In [33]:
#Naive Bayes
clf = MultinomialNB(alpha=1.0)
clf.fit(scaled_x, one_shot['Authenticity'])
predictions = clf.predict_proba(scaled_test)
predictions = [np.argmax(predictions[i]) for i in range(len(predictions))]

accuracy = sklearn.metrics.accuracy_score(predictions,ytest)
accuracy

0.51434034416826

In [34]:
from sklearn.ensemble import RandomForestClassifier
rfclf = RandomForestClassifier(n_jobs=2,random_state=0,verbose=True)
rfclf.fit(one_shot_x, one_shot['Authenticity'])
predictions = rfclf.predict(x_test)
accuracy = sklearn.metrics.accuracy_score(predictions,ytest)
accuracy

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished


0.5793499043977055