In [1]:
import pandas as pd
from pathlib import Path
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional, Flatten, BatchNormalization
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import re
import numpy as np
import dill as pickle


In [4]:
def commentCleaner(comments):
    """Normalise raw comments into lower-cased, stop-word-free, lemmatised text.

    For each comment the function:
      1. deletes every character that is neither a word character nor
         whitespace (punctuation, emojis), runs starting with ``http`` or
         ``www`` up to the next whitespace, and ``u/name`` style mentions;
      2. lower-cases the remainder and splits it on whitespace;
      3. drops English stop words and lemmatises the remaining tokens;
      4. joins the tokens back together with single spaces.

    Parameters
    ----------
    comments : iterable
        Raw comment texts. ``None`` and NaN entries are treated as empty
        comments; other non-string values are converted with ``str``.

    Returns
    -------
    list of str
        One cleaned string per input item, in the same order.
    """
    # Loop-invariant setup: the stop-word list and the lemmatizer do not depend
    # on the individual comment, so build them once per call instead of once
    # per comment.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    noise = re.compile(r"[^\w\s]|http\S+|www\S+|u/[A-Za-z0-9_-]+")

    cleaned_comments = []
    for comment in comments:
        # Missing values (None, or float NaN as pandas stores them) would make
        # the regex call raise; emit an empty string so the output stays
        # aligned one-to-one with the input.
        if comment is None or (isinstance(comment, float) and comment != comment):
            cleaned_comments.append("")
            continue

        text = noise.sub("", str(comment)).lower()
        tokens = [
            lemmatizer.lemmatize(token)
            for token in text.split()
            if token not in stop_words
        ]
        cleaned_comments.append(" ".join(tokens))
    return cleaned_comments


    
def tokenizeComments(comments, tokenizer, max_len=235):
    """Convert cleaned comments into fixed-length integer sequences.

    Parameters
    ----------
    comments : iterable of str
        Texts to encode.
    tokenizer : object
        Already-fitted tokenizer exposing ``texts_to_sequences``.
    max_len : int, optional
        Length every sequence is padded/truncated to by ``pad_sequences``.
        Defaults to 235, the value previously hard-coded here.

    Returns
    -------
    The value returned by ``pad_sequences`` for the encoded comments.
    """
    sequences = tokenizer.texts_to_sequences(comments)
    return pad_sequences(sequences, maxlen=max_len)

class textTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that cleans raw comments and encodes them as sequences.

    The tokenizer is supplied from outside and is expected to be fitted
    already; this transformer never fits it.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def fit(self, X, y=None):
        """Nothing to learn here; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Clean ``X`` with ``commentCleaner`` and encode it with ``tokenizeComments``."""
        return tokenizeComments(commentCleaner(X), self.tokenizer)

In [5]:
def load_pipeline_keras(cleaner, model, tokenizer):
    """Rebuild the inference pipeline from artefacts stored on disk.

    Parameters
    ----------
    cleaner : str or path-like
        Path to the pickled text transformer.
    model : str or path-like
        Path handed to ``keras.models.load_model``.
    tokenizer : str or path-like
        Path to the pickled tokenizer; it is attached to the loaded
        transformer as its ``tokenizer`` attribute.

    Returns
    -------
    Pipeline
        Two steps: ``'textTransformer'`` (the loaded transformer) followed by
        ``'model'`` (the loaded Keras model).
    """
    # SECURITY: unpickling can execute arbitrary code. Only load artefacts
    # produced by a trusted source.
    # Context managers make sure both file handles are closed, even if
    # unpickling raises.
    with open(cleaner, 'rb') as cleaner_file:
        text_transformer = pickle.load(cleaner_file)
    with open(tokenizer, 'rb') as tokenizer_file:
        fitted_tokenizer = pickle.load(tokenizer_file)

    keras_model = keras.models.load_model(model)
    text_transformer.tokenizer = fitted_tokenizer

    return Pipeline([
        ('textTransformer', text_transformer),
        ('model', keras_model)
    ])

In [17]:
# Load the scraped tweets.
# The tweet IDs in the recorded outputs are all exact multiples of 256, which
# is what a float64 round-trip does to integers of this magnitude. Reading
# ``id`` as text keeps every digit.
# NOTE(review): confirm against the raw JSON that the rounding happens on
# load and is not already present in the scraper output.
df = pd.read_json(
    "server/Data/twitterData/dataset_easy-twitter-search-scraper_2024-07-23_17-55-15-178.json",
    dtype={"id": str},
)
df.describe()

Unnamed: 0,id,url,verified,timestamp,text,links,isQuote,isRetweet,isReply,replyingTo,...,replies,retweets,quotes,searchQuery,user,images,quotedTweet,username,fullname,avatar
0,1815800545363517696,https://twitter.com/Hades51155014/status/18158...,False,2024-07-23 17:26:00+00:00,They all get self driving wrong. Building the ...,[],False,False,True,[/SawyerMerritt],...,0,0,0,Tesla,{'avatar': 'https://abs.twimg.com/sticky/defau...,,,,,
1,1815799687049535488,https://twitter.com/AKS5173/status/18157996870...,False,2024-07-23 17:22:00+00:00,No clue but I have a long position.\n\n$GOOGL ...,"[https://twitter.com/search?q=%23GOOGL, https:...",False,False,True,[/amitisinvesting],...,0,0,0,$meta,{'avatar': 'https://pbs.twimg.com/profile_imag...,,,,,
2,1815583310434992128,https://twitter.com/WR4NYGov/status/1815583310...,True,2024-07-23 03:02:00+00:00,For all the people justifying not buying a Tes...,[],False,True,False,,...,75,37,6,Tesla,{'avatar': 'https://pbs.twimg.com/profile_imag...,,,,,
3,1815800539181170944,https://twitter.com/teslaownersSV/status/18158...,True,2024-07-23 17:25:00+00:00,Reserved with intent 🪑\n\nTruth be told it was...,[],False,False,True,"[/rolotomars, /aaronjcash]",...,0,0,0,Tesla,{'avatar': 'https://pbs.twimg.com/profile_imag...,,,,,
4,1815800538425905408,https://twitter.com/Adamu436395/status/1815800...,False,2024-07-23 17:25:00+00:00,You are right history will judge,[],False,False,True,"[/nickfishwizard, /RealDanODowd, /Tesla, /Whol...",...,0,0,0,Tesla,{'avatar': 'https://pbs.twimg.com/profile_imag...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1203,1815800430519091200,https://twitter.com/janelleyy_/status/18158004...,False,2024-07-23 17:25:00+00:00,Imagine thinking raves started in London.. a b...,[],True,False,True,[/fabiiihartmusic],...,0,0,0,google,{'avatar': 'https://pbs.twimg.com/profile_imag...,,{'url': 'https://twitter.com/karsynscheirich/s...,,,
1204,1815791414174744832,https://twitter.com/WokeChallenger/status/1815...,False,2024-07-23 16:49:00+00:00,"This is hilarious…. She’s asking Alexa, a devi...",[],False,False,True,[/ronin19217435],...,0,0,0,airpods,{'avatar': 'https://pbs.twimg.com/profile_imag...,,,,,
1205,1815800415134572800,https://twitter.com/attwood21239/status/181580...,False,2024-07-23 17:25:00+00:00,Google search history of Nirmala Sitharaman.\n...,"[https://twitter.com/search?q=%23Budget2024, h...",False,False,False,,...,0,0,0,google,{'avatar': 'https://pbs.twimg.com/profile_imag...,,,@attwood21239,Brendan Attwood,https://pbs.twimg.com/profile_images/178812803...
1206,1815800391331971584,https://twitter.com/attwood21239/status/181580...,False,2024-07-23 17:25:00+00:00,Google search history of Nirmala Sitharaman.\n...,"[https://twitter.com/search?q=%23Budget2024, h...",False,False,False,,...,0,0,0,google,{'avatar': 'https://pbs.twimg.com/profile_imag...,,,@attwood21239,Brendan Attwood,https://pbs.twimg.com/profile_images/178812803...


In [24]:
# Keep only the columns needed for the cleaned export. ``.copy()`` makes the
# result an independent frame, so later column assignments on ``df`` do not
# trigger pandas' SettingWithCopyWarning.
columns_to_keep = ['timestamp', 'text', 'url', 'id']
df = df[columns_to_keep].copy()

In [25]:
# Clean the tweet text and put it back under the same column name.
# ``assign`` returns a new DataFrame instead of writing into ``df`` in place,
# which avoids SettingWithCopyWarning when ``df`` was derived by selecting
# columns from another frame.
x = df['text']
cleaned_x = commentCleaner(x)
df = df.assign(text=cleaned_x)
df

Unnamed: 0,timestamp,text,url,id
0,2024-07-23 17:26:00+00:00,get self driving wrong building car easy build...,https://twitter.com/Hades51155014/status/18158...,1815800545363517696
1,2024-07-23 17:22:00+00:00,clue long position googl leading big tech earn...,https://twitter.com/AKS5173/status/18157996870...,1815799687049535488
2,2024-07-23 03:02:00+00:00,people justifying buying tesla elons political...,https://twitter.com/WR4NYGov/status/1815583310...,1815583310434992128
3,2024-07-23 17:25:00+00:00,reserved intent truth told personal matter hop...,https://twitter.com/teslaownersSV/status/18158...,1815800539181170944
4,2024-07-23 17:25:00+00:00,right history judge,https://twitter.com/Adamu436395/status/1815800...,1815800538425905408
...,...,...,...,...
1203,2024-07-23 17:25:00+00:00,imagine thinking raf started london book googl...,https://twitter.com/janelleyy_/status/18158004...,1815800430519091200
1204,2024-07-23 16:49:00+00:00,hilarious shes asking alexa device listens eve...,https://twitter.com/WokeChallenger/status/1815...,1815791414174744832
1205,2024-07-23 17:25:00+00:00,google search history nirmala sitharaman budge...,https://twitter.com/attwood21239/status/181580...,1815800415134572800
1206,2024-07-23 17:25:00+00:00,google search history nirmala sitharaman budge...,https://twitter.com/attwood21239/status/181580...,1815800391331971584


In [None]:
# Persist the cleaned frame as CSV in the current working directory.
# Note: the target name has no ``.csv`` extension, and because ``index`` is
# left at its default the row index is written as the first column.
df.to_csv('cleaned_twitter_data')