In [1]:
import pandas as pd
import tensorflow as tf
from nltk import word_tokenize
import numpy as np
from numpy.linalg import norm
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import keras
from keras.layers import Embedding, Dense
from keras.models import Sequential
import gensim.downloader as api
import pickle

2023-10-15 22:43:46.005027: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read every dataset used by the notebook: the fake-news train/test files,
# the NER corpus, the test-set label table and the analogy list.
trainDf = pd.read_csv("FakeNews_train.csv", sep='	')
testDf = pd.read_csv("FakeNews_test.csv", sep='	')
nerDf = pd.read_csv("ner_dataset.csv", encoding='unicode_escape')
IDDf = pd.read_csv("FakeNews_id_to_lable_test.csv")
analogiesDf = pd.read_csv("analogy-test.txt", sep=' ')
# The 'id' column of the test file is discarded right after loading.
testDf = testDf.drop(columns=['id'])


In [3]:
#data cleaning functions
class cleaning_func:
    """Stateless text-cleaning helpers applied to dataframe columns before vectorisation."""

    def lower_case(self, text):
        """Return `text` converted to lower case."""
        return text.lower()

    def word_tokenization(self, text):
        """Return the list of word-character runs found in `text` (punctuation and whitespace are dropped)."""
        return re.findall(r'\w+', text)

    def remove_characters(self, text):
        """Return `text` with the symbols $ % # & * @ : and straight/curly double quotes removed."""
        return re.sub(r'[$%#&*@:"”“]', '', text)

    def remove_numbers(self, text):
        """Return `text` with every run of digits removed."""
        return re.sub(r'\d+', '', text)

    def string_to_ints(self, df):
        """Convert the string labels '0' and '1' of a label column to integers.

        Parameters
        ----------
        df : pandas.Series
            The label column (a Series, despite the parameter name).

        Returns
        -------
        pandas.Series
            A new Series with the same index in which '0' -> 0, '1' -> 1 and the
            literal string 'label' -> 1 (kept from the original mapping; presumably
            a stray header row -- confirm against the data). Every other value is
            passed through unchanged.

        The previous implementation assigned through ``df[i]`` with a running
        counter, which only hit the right rows when the index was 0..n-1 and
        modified the caller's Series in place. Mapping value-by-value is
        independent of the index and leaves the input untouched.
        """
        replacements = {'0': 0, '1': 1, 'label': 1}

        def convert(value):
            # Only strings are candidates; ints, NaN, etc. are returned as they are.
            if isinstance(value, str):
                return replacements.get(value, value)
            return value

        return df.map(convert)

In [4]:
class setup:
    """Builds a vocabulary and fixed-length index vectors from a tokenised corpus."""

    #building vocabulary dictionary and index vectors
    def index_and_vocab(self, df, maxLength=10000):
        """Build the vocabulary of ``df['text']`` and encode every sentence as indices.

        Parameters
        ----------
        df : mapping / DataFrame with a 'text' entry
            ``df['text']`` must iterate over sentences, each an iterable of
            (hashable) tokens.
        maxLength : int, default 10000
            Length every index vector is truncated or zero-padded to. The
            default matches the value that used to be hard-coded.

        Returns
        -------
        (indexVector, wordDictionary)
            ``indexVector`` maps the sentence position (0, 1, ...) to a list of
            exactly ``maxLength`` vocabulary indices; ``wordDictionary`` maps
            index -> word, with index 0 reserved for 'pad'.

        Notes
        -----
        A token that is literally 'pad' is encoded as index 0, i.e. it is
        indistinguishable from padding (same as the previous behaviour).
        """
        sentences = df['text']
        wordDictionary = {0: 'pad'}
        # Reverse lookup (word -> index) so membership tests and encoding are
        # O(1) instead of scanning every dictionary value for every token.
        wordToIndex = {'pad': 0}

        for eachSentence in sentences:
            for eachWord in eachSentence:
                if eachWord not in wordToIndex:
                    newIndex = len(wordDictionary)      # indices are assigned consecutively from 1
                    wordDictionary[newIndex] = eachWord
                    wordToIndex[eachWord] = newIndex

        indexVector = {}
        for k, eachSentence in enumerate(sentences):
            # Encode, cut off anything beyond maxLength, then right-pad with the pad index 0.
            wordIndices = [wordToIndex[eachWord] for eachWord in eachSentence][:maxLength]
            wordIndices.extend([0] * (maxLength - len(wordIndices)))
            indexVector[k] = wordIndices

        return indexVector, wordDictionary

In [5]:
#data cleaning and preprocessing
# NOTE: this cell rewrites the 'text' columns in place (str -> list of tokens),
# so it must be run exactly once after the data-loading cell.
clean = cleaning_func()
trainDf['text'] = trainDf['text'].apply(clean.lower_case)
#trainDf['text'] = trainDf['text'].apply(clean.remove_characters)
#trainDf['text'] = trainDf['text'].apply(clean.remove_numbers)
trainDf['text'] = trainDf['text'].apply(clean.word_tokenization)
trainDf['label'] = clean.string_to_ints(trainDf['label'])

testDf['text'] = testDf['text'].apply(clean.lower_case)
#testDf['text'] = testDf['text'].apply(clean.remove_characters)
#testDf['text'] = testDf['text'].apply(clean.remove_numbers)
testDf['text'] = testDf['text'].apply(clean.word_tokenization)

# `setup` is rebound from the class to an instance (later cells use the
# instance). Guard the rebinding so executing it a second time does not try
# to call the instance.
if isinstance(setup, type):
    setup = setup()
indexVector, wordDictionary = setup.index_and_vocab(trainDf)

valList = list(indexVector.values())
maxVec = max([len(lst) for lst in indexVector.values()])

# The test set has to be encoded with the TRAINING vocabulary: the models'
# embedding rows are indexed by wordDictionary, so a vocabulary built from the
# test set alone would assign unrelated (and possibly out-of-range) indices.
# Words never seen in training fall back to index 0, the pad entry.
wordToIndex = {word: index for index, word in wordDictionary.items()}
indexVectorTest = {}
for k, eachSentence in enumerate(testDf['text']):
    wordIndices = [wordToIndex.get(eachWord, 0) for eachWord in eachSentence][:maxVec]
    indexVectorTest[k] = wordIndices + [0] * (maxVec - len(wordIndices))
# Kept for backward compatibility: the test set now shares the training vocabulary.
wordDictionaryTest = wordDictionary

In [6]:
#creating train, validation and testing set for fake news dataset
# 90/10 train/validation split of the encoded training data. random_state pins
# the shuffle so the split (and therefore the reported metrics) can be
# reproduced after a kernel restart.
XTrainDf, XValDf, YTrainDf, YValDf = train_test_split(
    np.array(list(indexVector.values())),
    np.array(trainDf['label']),
    test_size=0.1,
    random_state=42,
)
# Test inputs come from the encoded test set, test labels from the id->label table.
Xtest = np.array(list(indexVectorTest.values()))
Ytest = np.array(IDDf['label'])

# Force integer dtype on both the index matrices and the label vectors.
XTrainDf = XTrainDf.astype(int)
YTrainDf = YTrainDf.astype(int)
XValDf = XValDf.astype(int)
YValDf = YValDf.astype(int)

# Embedding layer dimensions used by the model cells below.
vocabSize = len(wordDictionary)
maxIndexLength = max([len(lst) for lst in indexVector.values()])

In [None]:
#Task 1
#training model
# Embedding table filled with uniform random values, one 300-d row per vocabulary index.
# No seed is set here, so the values differ between kernel sessions.
embeddingMatrix = np.random.rand(vocabSize, 300)

model = keras.Sequential()
# Frozen embedding lookup (trainable=False) initialised from embeddingMatrix.
model.add(Embedding(vocabSize, 300, weights=[embeddingMatrix], input_length=maxIndexLength, trainable=False))
# Average the embedding output over axis 1, leaving one 300-d vector per input row.
model.add(keras.layers.Lambda(lambda x: keras.backend.mean(x, axis=1)))
# First summary: printed before the dense layers are added.
model.summary()

# Classifier head: 25-unit ReLU layer followed by a single sigmoid output.
model.add(Dense(25, 'relu'))
model.add(Dense(1, 'sigmoid'))
# Second summary: the complete model.
model.summary()

model.compile(optimizer='adam', loss='BinaryCrossentropy', metrics=['accuracy'])
model.fit(XTrainDf, YTrainDf, batch_size=64, epochs=30, verbose=2, validation_data=(XValDf, YValDf))

print("\nTesting Results\n")

#testing model
model.evaluate(Xtest, Ytest, batch_size=64, verbose=2)
yTrue = np.array(Ytest)
yPredProbabilites = model.predict(Xtest)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
yPred = (yPredProbabilites > 0.5).astype(np.int64)

precision = precision_score(yTrue, yPred)
recall = recall_score(yTrue, yPred)
f1 = f1_score(yTrue, yPred)
confusionMatrix = confusion_matrix(yTrue, yPred)

print(f"Precision Score: {precision}")
print(f"Recall Score: {recall}")
print(f"f1 Score: {f1}")
print(f"Confusion Matrix: {confusionMatrix}")

In [None]:
#Task 2a
#Training model
# Same architecture and settings as the Task 1 cell; it reuses the random
# embeddingMatrix created there, so that cell must have been run first.
modelTwoA = keras.Sequential()
# Frozen embedding lookup (trainable=False) initialised from the random embeddingMatrix.
modelTwoA.add(Embedding(vocabSize, 300, weights=[embeddingMatrix], input_length=maxIndexLength, trainable=False))
# Average the embedding output over axis 1, leaving one 300-d vector per input row.
modelTwoA.add(keras.layers.Lambda(lambda x: keras.backend.mean(x, axis=1)))
# First summary: printed before the dense layers are added.
modelTwoA.summary()

# Classifier head: 25-unit ReLU layer followed by a single sigmoid output.
modelTwoA.add(Dense(25, 'relu'))
modelTwoA.add(Dense(1, 'sigmoid'))
# Second summary: the complete model.
modelTwoA.summary()

modelTwoA.compile(optimizer='adam', loss='BinaryCrossentropy', metrics=['accuracy'])
modelTwoA.fit(XTrainDf, YTrainDf, batch_size=64, epochs=30, verbose=2, validation_data=(XValDf, YValDf))

print("\n2a Testing Results\n")

#testing model
modelTwoA.evaluate(Xtest, Ytest, batch_size=64, verbose=2)
yTrue = np.array(Ytest)
yPredProbabilites = modelTwoA.predict(Xtest)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
yPred = (yPredProbabilites > 0.5).astype(np.int64)

precision = precision_score(yTrue, yPred)
recall = recall_score(yTrue, yPred)
f1 = f1_score(yTrue, yPred)
confusionMatrix = confusion_matrix(yTrue, yPred)

print(f"Precision Score: {precision}")
print(f"Recall Score: {recall}")
print(f"f1 Score: {f1}")
print(f"Confusion Matrix: {confusionMatrix}")

In [None]:
#preTrainedEmbeddingMatrix = api.load('word2vec-google-news-300')

In [None]:
# Disabled code, kept inside a string literal so it does not execute.
# It builds one 300-d row per word of wordDictionary: the pre-trained vector
# when the word is known to preTrainedEmbeddingMatrix, otherwise (out of
# vocabulary) a random vector. The pickle loaded in the next cell is
# presumably the saved result of this loop -- see the commented-out
# pickle.dump there.
'''
newPreTrainedEmbeddingMatrix = []
for word in list(wordDictionary.values()):
    if (word not in list(preTrainedEmbeddingMatrix.key_to_index)):
        newPreTrainedEmbeddingMatrix.append(np.random.rand(300))
    else:
        newPreTrainedEmbeddingMatrix.append(list(preTrainedEmbeddingMatrix[word]))
'''

'\nnewPreTrainedEmbeddingMatrix = []\nfor word in list(wordDictionary.values()):\n    if (word not in list(preTrainedEmbeddingMatrix.key_to_index)):\n        newPreTrainedEmbeddingMatrix.append(np.random.rand(300))\n    else:\n        newPreTrainedEmbeddingMatrix.append(list(preTrainedEmbeddingMatrix[word]))\n'

In [None]:
#pickle.dump(newPreTrainedEmbeddingMatrix, open('preTrainedEmbeddings.pkl', 'wb'))
# Load the cached embedding matrix. The context manager closes the file
# handle, which the bare open(...) passed to pickle.load never did.
# SECURITY NOTE: unpickling executes code embedded in the file -- only load a
# preTrainedEmbeddings.pkl that was produced locally by this notebook.
with open('./preTrainedEmbeddings.pkl', 'rb') as embeddingsFile:
    myPreTrainedEmbeddings = pickle.load(embeddingsFile)
myPreTrainedEmbeddings

array([[-0.33984375, -0.1796875 , -0.07421875, ..., -0.13574219,
        -0.56640625,  0.11962891],
       [ 0.03320312, -0.08984375, -0.29492188, ..., -0.12695312,
         0.16113281, -0.16015625],
       [ 0.08007812,  0.10498047,  0.04980469, ...,  0.00366211,
         0.04760742, -0.06884766],
       ...,
       [-0.00601196, -0.30078125,  0.01080322, ..., -0.39648438,
         0.10742188, -0.06347656],
       [ 0.15625   , -0.18066406, -0.23339844, ...,  0.12792969,
         0.36914062,  0.01831055],
       [ 0.99802572,  0.89542166,  0.41218379, ...,  0.00684163,
         0.42792489,  0.02735533]])

In [None]:
#task 2b
#training model
#vocabSize = len(embeddingMatrix.index_to_key)
#newPreTrainedEmbeddingMatrix = np.array(newPreTrainedEmbeddingMatrix)
# Use the embedding matrix loaded from the pickle file as the initial weights.
PreTrainedEmbeddingMatrix = myPreTrainedEmbeddings
# NOTE(review): this rebinds the notebook-global vocabSize to the number of rows
# in the loaded matrix; cells executed afterwards see the new value.
vocabSize = len(PreTrainedEmbeddingMatrix)

modelTwoB = keras.Sequential()
# Embedding starts from the loaded matrix and is fine-tuned during training (trainable=True).
modelTwoB.add(Embedding(vocabSize, 300, weights=[PreTrainedEmbeddingMatrix], input_length=maxIndexLength, trainable=True))
# Average the embedding output over axis 1, leaving one 300-d vector per input row.
modelTwoB.add(keras.layers.Lambda(lambda x: keras.backend.mean(x, axis=1)))
# First summary: printed before the dense layers are added.
modelTwoB.summary()

# Classifier head: 25-unit ReLU layer followed by a single sigmoid output.
modelTwoB.add(Dense(25, 'relu'))
modelTwoB.add(Dense(1, 'sigmoid'))
# Second summary: the complete model.
modelTwoB.summary()

modelTwoB.compile(optimizer='adam', loss='BinaryCrossentropy', metrics=['accuracy'])
modelTwoB.fit(XTrainDf, YTrainDf, batch_size=64, epochs=30, verbose=2, validation_data=(XValDf, YValDf))

print("\n2b Testing Results\n")

#testing model
modelTwoB.evaluate(Xtest, Ytest, batch_size=64, verbose=2)
yTrue = np.array(Ytest)
yPredProbabilites = modelTwoB.predict(Xtest)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
yPred = (yPredProbabilites > 0.5).astype(np.int64)

precision = precision_score(yTrue, yPred)
recall = recall_score(yTrue, yPred)
f1 = f1_score(yTrue, yPred)
confusionMatrix = confusion_matrix(yTrue, yPred)

print(f"Precision Score: {precision}")
print(f"Recall Score: {recall}")
print(f"f1 Score: {f1}")
print(f"Confusion Matrix: {confusionMatrix}")

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 10000, 300)        19198500  
                                                                 
 lambda_2 (Lambda)           (None, 300)               0         
                                                                 
Total params: 19198500 (73.24 MB)
Trainable params: 19198500 (73.24 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 10000, 300)        19198500  
                                                                 
 lambda_2 (Lambda)           (None, 300)               0         
                                                        

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#task 2c
#training model
# Use the embedding matrix loaded from the pickle file as fixed weights.
PreTrainedEmbeddingMatrix = myPreTrainedEmbeddings
# NOTE(review): this rebinds the notebook-global vocabSize to the number of rows
# in the loaded matrix; cells executed afterwards see the new value.
vocabSize = len(PreTrainedEmbeddingMatrix)

modelTwoC = keras.Sequential()
# Same as task 2b except the embedding is frozen (trainable=False).
modelTwoC.add(Embedding(vocabSize, 300, weights=[PreTrainedEmbeddingMatrix], input_length=maxIndexLength, trainable=False))
# Average the embedding output over axis 1, leaving one 300-d vector per input row.
modelTwoC.add(keras.layers.Lambda(lambda x: keras.backend.mean(x, axis=1)))
# First summary: printed before the dense layers are added.
modelTwoC.summary()

# Classifier head: 25-unit ReLU layer followed by a single sigmoid output.
modelTwoC.add(Dense(25, 'relu'))
modelTwoC.add(Dense(1, 'sigmoid'))
# Second summary: the complete model.
modelTwoC.summary()

modelTwoC.compile(optimizer='adam', loss='BinaryCrossentropy', metrics=['accuracy'])
modelTwoC.fit(XTrainDf, YTrainDf, batch_size=64, epochs=30, verbose=2, validation_data=(XValDf, YValDf))

# The banner used to read "2b" (copied from the previous cell), which
# mislabelled these results.
print("\n2c Testing Results\n")

#testing model
modelTwoC.evaluate(Xtest, Ytest, batch_size=64, verbose=2)
yTrue = np.array(Ytest)
yPredProbabilites = modelTwoC.predict(Xtest)
# NOTE(review): this cell thresholds at 0.42 while the other model cells use
# 0.5 -- left unchanged; confirm the lower cut-off is intentional.
yPred = (yPredProbabilites > 0.42).astype(np.int64)

precision = precision_score(yTrue, yPred)
recall = recall_score(yTrue, yPred)
f1 = f1_score(yTrue, yPred)
confusionMatrix = confusion_matrix(yTrue, yPred)

print(f"Precision Score: {precision}")
print(f"Recall Score: {recall}")
print(f"f1 Score: {f1}")
print(f"Confusion Matrix: {confusionMatrix}")

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 10000, 300)        19198500  
                                                                 
 lambda_8 (Lambda)           (None, 300)               0         
                                                                 
Total params: 19198500 (73.24 MB)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 19198500 (73.24 MB)
_________________________________________________________________
Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 10000, 300)        19198500  
                                                                 
 lambda_8 (Lambda)           (None, 300)               0         
                                                        

In [None]:
#task 3
# Persist the fine-tuned task-2b model, read it back, and take the weight
# matrix of its first layer (the Embedding layer added first to modelTwoB).
# NOTE(review): the destination is an absolute, machine-specific path; the
# cell only works where that directory exists.
tf.keras.models.save_model(modelTwoB, '/Users/allen/Downloads/School/AI for NLP/Homework/HW 2')
loadedModel = tf.keras.models.load_model('/Users/allen/Downloads/School/AI for NLP/Homework/HW 2')
postTrainedEmbeddings = loadedModel.layers[0].get_weights()[0]

def swap_values_and_keys(dictionary):
    """Return a new dict that maps each value of `dictionary` back to its key.

    Entries are processed in the dictionary's iteration order, so when several
    keys share the same value the key seen last is the one that is kept.
    """
    swappedDictionary = {}
    for originalKey, originalValue in dictionary.items():
        swappedDictionary[originalValue] = originalKey
    return swappedDictionary

# word -> index view of the training vocabulary.
wordDic = swap_values_and_keys(wordDictionary)

# Pair the words of wordDic, in iteration order, with the rows of each
# embedding matrix; pairing stops at the shorter of the two.
#   A: random matrix, B: weights read back from the saved 2b model,
#   C: matrix loaded from the pickle file.
wordEmbeddingA = dict(zip(wordDic, embeddingMatrix))
wordEmbeddingB = dict(zip(wordDic, postTrainedEmbeddings))
wordEmbeddingC = dict(zip(wordDic, PreTrainedEmbeddingMatrix))



INFO:tensorflow:Assets written to: /Users/allen/Downloads/School/AI for NLP/Homework/HW 2/assets


INFO:tensorflow:Assets written to: /Users/allen/Downloads/School/AI for NLP/Homework/HW 2/assets


{'pad': array([0.38792322, 0.029546  , 0.80782495, 0.59344165, 0.65461713,
        0.68640619, 0.48607514, 0.90351476, 0.66359969, 0.44834619,
        0.68310151, 0.91412983, 0.31080058, 0.30707601, 0.510169  ,
        0.25352468, 0.26464624, 0.33558734, 0.94254938, 0.87244132,
        0.4321798 , 0.10677276, 0.39949675, 0.21627931, 0.6192446 ,
        0.17380353, 0.49411732, 0.88135409, 0.39743756, 0.29713564,
        0.07653037, 0.03677586, 0.86531036, 0.636912  , 0.41863208,
        0.49921541, 0.34645065, 0.07949345, 0.09647969, 0.5219243 ,
        0.15406641, 0.04745196, 0.12145564, 0.14896075, 0.32641885,
        0.11897317, 0.72280368, 0.04505672, 0.70508551, 0.6154043 ,
        0.26599896, 0.13394579, 0.21635578, 0.25547272, 0.39014311,
        0.58415111, 0.29271018, 0.45023173, 0.30927096, 0.99009875,
        0.42099162, 0.41071257, 0.04813121, 0.87176074, 0.37457744,
        0.78094425, 0.09099172, 0.73426576, 0.48683699, 0.84373089,
        0.59993779, 0.45956024, 0.5526336

In [None]:
#text preprocessing on analogy file
# Read the analogy file: each usable line holds exactly four whitespace-
# separated words; everything is lower-cased and other lines are skipped.
analogies = []

# `with` guarantees the file is closed even if reading fails part-way.
with open("analogy-test.txt", 'r') as file:
    for lines in file:
        words = lines.strip().lower().split()
        if (len(words) == 4):
            analogies.append(words)

# Keep only analogies whose four words are all in the vocabulary.
# The earlier version popped from `analogies` while iterating over it (and
# once per missing word), which skipped entries and could delete neighbouring
# valid analogies; building a filtered list avoids both problems and makes
# the repeated passes unnecessary.
analogies = [
    eachList
    for eachList in analogies
    if all(eachWord in wordDic for eachWord in eachList)
]


In [None]:
def nearest_neighbor(w1, w2, w3, wordEmbeddingMat):
    """Return the word whose embedding is closest to ``w3 + (w2 - w1)``.

    Solves the analogy "w1 is to w2 as w3 is to ?" with Euclidean distance.

    Parameters
    ----------
    w1, w2, w3 : array-like
        Embedding vectors of the three given analogy words.
    wordEmbeddingMat : dict
        Maps word -> embedding vector (all vectors of equal length).

    Returns
    -------
    The key of the nearest embedding, or None when no candidate is left.
    Entries whose vector is identical to one of the three query vectors are
    not eligible, so the answer is never simply one of the input words.

    The previous version collapsed the query to a scalar built from vector
    norms and then used a distance value as a list index, so its result was
    unrelated to the analogy.
    """
    candidateWords = list(wordEmbeddingMat.keys())
    if not candidateWords:
        return None

    w1, w2, w3 = (np.asarray(vector, dtype=float) for vector in (w1, w2, w3))
    searchVec = w3 + (w2 - w1)

    candidateVectors = np.asarray(list(wordEmbeddingMat.values()), dtype=float)
    distances = np.linalg.norm(candidateVectors - searchVec, axis=1)

    # Rule out the query words themselves by pushing their distance to infinity.
    for queryVector in (w1, w2, w3):
        distances[np.all(candidateVectors == queryVector, axis=1)] = np.inf

    bestPosition = int(np.argmin(distances))
    if not np.isfinite(distances[bestPosition]):
        return None
    return candidateWords[bestPosition]

def predict_analogies(analogies, wordEmbeddingMat):
    """Score a list of four-word analogies against an embedding table.

    Each entry of `analogies` is read as [a, b, c, d] meaning "a is to b as
    c is to d". The embeddings of a, b and c are passed to nearest_neighbor
    and the predicted word is compared with d.

    Parameters
    ----------
    analogies : iterable of sequences of four words
    wordEmbeddingMat : dict
        Maps word -> embedding vector.

    Prints one line per evaluated analogy followed by the found / not-found
    and match / mismatch counts and the matching accuracy in percent.
    Returns None.

    The previous version looped over the individual words of every analogy
    and used the same word for all three query vectors and for the expected
    answer; it also raised KeyError for unknown words and ZeroDivisionError
    when nothing could be evaluated.
    """
    found = 0
    notFound = 0
    matchCount = 0
    mismatchCount = 0

    for eachList in analogies:
        # Entries that are not exactly four words cannot be evaluated.
        if len(eachList) != 4:
            notFound += 1
            continue

        firstWord, secondWord, thirdWord, w4 = eachList
        # .get() yields None for words without an embedding instead of raising.
        w1 = wordEmbeddingMat.get(firstWord)
        w2 = wordEmbeddingMat.get(secondWord)
        w3 = wordEmbeddingMat.get(thirdWord)

        if (w1 is not None) and (w2 is not None) and (w3 is not None):
            found += 1
            bestMatch = nearest_neighbor(w1, w2, w3, wordEmbeddingMat)
            print(f"{eachList}: {bestMatch}")
            if bestMatch == w4:
                matchCount += 1
            else:
                mismatchCount += 1
        else:
            notFound += 1

    # Avoid dividing by zero when no analogy could be evaluated.
    accuracy = (matchCount*100)/found if found else 0.0

    print(f"found - {found}: not found - {notFound}")
    print(f"match - {matchCount}: mismatch - {mismatchCount}")
    print(f"matching accuracy - {accuracy}")

# Evaluate the analogies once per embedding table, announcing each run first.
for runBanner, embeddingTable in (
    ("Analogy predictions using embeddings for part 2a", wordEmbeddingA),
    ("Analogy predictions using embeddings for part 2b", wordEmbeddingB),
    ("Analogy predictions using embeddings for part 2c", wordEmbeddingC),
):
    print(runBanner)
    predict_analogies(analogies, embeddingTable)


In [None]:
#part 4a
clean = cleaning_func()
#setup = setup()
# Model inputs (sentence marker + word) and targets (tag), one row per token.
nerInput = nerDf[['Sentence #', 'Word']]
nerOutput = nerDf['Tag']

# Group the words of each sentence into one list.
# The earlier nested loop appended the ENTIRE 'Word' column once for every
# row, so each "sentence" was the whole corpus and memory grew quadratically.
# Assumes 'Sentence #' is filled on the first token of a sentence and may be
# empty on the following tokens -- TODO confirm against ner_dataset.csv;
# ffill() is a no-op when every row already carries its sentence id.
sentenceIds = nerDf['Sentence #'].ffill()
sentenceList = [
    list(sentenceWords)
    for _, sentenceWords in nerDf['Word'].groupby(sentenceIds, sort=False)
]

# 80/20 split over individual token rows.
xTrain, xTest, yTrain, yTest = train_test_split(nerInput, nerOutput, test_size=0.2)

# Show only the first few sentences instead of dumping the full list.
sentenceList[:5]