<a href="https://colab.research.google.com/github/AK-Github-0/NLP-Lab-Final/blob/main/2020024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing Libraries and Modules

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
import re
import string 
import nltk
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from tqdm.auto import tqdm

In [13]:
!pip install vaderSentiment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


### Defining all functions

In [50]:


def load_dataset(path, n_rows=100):
  """Read a CSV file into a DataFrame, keeping only its leading rows.

  Args:
    path: Any source accepted by `pandas.read_csv`.
    n_rows: Number of rows to keep from the top of the file. Defaults to 100,
      the limit that used to be hard-coded; pass None to keep every row.

  Returns:
    A DataFrame with at most `n_rows` rows.
  """
  df = pd.read_csv(path)
  if n_rows is None:
    return df
  return df.head(n_rows)



In [None]:


def _most_common_hashtags(hashtag_column, top_n=30):
  """Count hashtags stored as list-like strings and keep the `top_n` most frequent.

  Each entry is expected to look like "['a', 'b']": the surrounding brackets
  are stripped and the remainder is split on ', '.

  Returns:
    A DataFrame with the columns 'common hashtags' and 'count'.
  """
  tags = []
  # Missing values are floats (NaN) and have no .strip(); skip them.
  for raw in hashtag_column.dropna():
    tags.extend(str(raw).strip('][').split(', '))
  counts = pd.Series(tags, dtype='object').value_counts().head(top_n)
  return counts.rename_axis('common hashtags').reset_index(name='count')


def _source_bar_figure(source_column, title, height, top_n=10):
  """Build a dark-themed bar chart of the `top_n` most frequent values of a column."""
  # Labels and bar heights both come from the same value_counts() result, so
  # they always line up and do not depend on how reset_index() names columns.
  counts = source_column.value_counts().head(top_n)
  trace = go.Bar(
      x=counts.index.tolist(),
      y=counts.values,
      marker=dict(color='rgb(250,13,92)', line=dict(color='rgb(0,0,0)', width=1.5)),
      text=counts.values,
      textposition='outside')
  layout = go.Layout(template='plotly_dark', title=title, xaxis=dict(title='Source'),
                     yaxis=dict(title='Count'), height=height)
  return go.Figure(data=[trace], layout=layout)


def EDA(df):
  """Write the exploratory charts for a tweet DataFrame to image files.

  Reads the columns 'user_verified', 'source' and 'hashtags' of `df` and saves,
  in the current working directory:

    * user_verifiedornot.png                  - histogram of 'user_verified'
    * Source_distribution.jpeg                - 10 most frequent sources
    * Common_hashtags_by_verified.jpeg        - treemap, verified accounts
    * Common_hashtags_by_unverified.jpeg      - treemap, unverified accounts
    * Source_distributions_from_verified.jpeg - 10 most frequent sources
                                                among verified accounts

  The plotly figures are saved with `write_image`, which requires a static
  image export engine to be installed.

  Args:
    df: DataFrame containing the three columns listed above.

  Returns:
    None.
  """
  plt.figure(figsize = (12,6))
  sns.histplot(df['user_verified'])
  plt.title('Account Distribution',fontsize = 20)
  plt.savefig('user_verifiedornot.png')

  fig = _source_bar_figure(df['source'], 'Top 10 Most Source Disrtibution Of Tweets', height=700)
  fig.write_image("Source_distribution.jpeg")

  # Explicit comparisons (rather than truthiness) so that only real
  # True / False values are selected, as before.
  data_verified = df[df['user_verified'] == True]
  data_not_verified = df[df['user_verified'] == False]

  common_hashtags = _most_common_hashtags(data_verified['hashtags'])
  fig = px.treemap(common_hashtags, path=['common hashtags'], values='count',
                   title='30 Most common hashtags by Verified Accounts')
  fig.write_image("Common_hashtags_by_verified.jpeg")

  # Iterate over the unverified rows themselves; sizing this pass by the
  # verified frame would either truncate it or index past the end.
  common_hashtags = _most_common_hashtags(data_not_verified['hashtags'])
  fig = px.treemap(common_hashtags, path=['common hashtags'], values='count',
                   title='30 Most common hashtags by unverified Accounts')
  fig.write_image("Common_hashtags_by_unverified.jpeg")

  # The chart shows ten bars, so the title says "Top 10".
  fig = _source_bar_figure(
      data_verified['source'],
      'Top 10 Most Source Distribution of Tweets From Verified Accounts',
      height=650)
  fig.write_image("Source_distributions_from_verified.jpeg")



In [52]:

def remove_line_breaks(text):
    """Return `text` with every carriage return and every newline turned into one space."""
    # One translation table handles both characters in a single pass.
    return text.translate({ord('\r'): ' ', ord('\n'): ' '})


def remove_punctuation(text):
    """Replace punctuation inside tokens with spaces, sparing placeholder tokens.

    The text is split with `word_tokenize`. A token that starts with an
    upper-case placeholder (such as __NAME__, __LINK__) is kept verbatim; in
    every other token each `string.punctuation` character is replaced by one
    space. The tokens are then joined back with single spaces.
    """
    placeholder = re.compile("__[A-Z]+__")  # such as __NAME__, __LINK__
    # re.escape escapes all characters except ASCII letters and numbers, so
    # each punctuation mark is taken literally inside the character class.
    punctuation = re.compile("[%s]" % re.escape(string.punctuation))
    return ' '.join(
        token if placeholder.match(token) else punctuation.sub(" ", token)
        for token in word_tokenize(text)
    )


def remove_special_characters(text):
    """Delete every character that is not an ASCII letter, a digit or whitespace.

    Args:
        text: The string to clean.

    Returns:
        `text` with all other characters removed (nothing is inserted in
        their place).
    """
    # The upper-case range must end at 'Z': the range 'A-z' also spans the
    # characters [ \ ] ^ _ ` that sit between 'Z' and 'a' in ASCII, which
    # would let them slip through. The raw string keeps \s a regex escape.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)


def lowercase(text):
    """Tokenize `text` with `word_tokenize` and return the lower-cased tokens joined by spaces."""
    lowered = []
    for token in word_tokenize(text):
        lowered.append(token.lower())
    return ' '.join(lowered)


def remove_stopwords(text):
    """Drop English stop words from `text`.

    The text is tokenized with `nltk.word_tokenize`; tokens found in NLTK's
    English stop-word list (exact, case-sensitive match) are discarded and
    the remaining tokens are joined with single spaces.
    """
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for token in nltk.word_tokenize(text):
        if token in english_stopwords:
            continue
        kept.append(token)
    return " ".join(kept)


def remove_one_character_words(text):
    '''Keep only the tokens of `text` that are longer than one character.

    Tokens come from `word_tokenize` and are re-joined with single spaces.
    '''
    return ' '.join(filter(lambda token: len(token) > 1, word_tokenize(text)))
    

def stem(text):
    """Stem each token of `text` with the English Snowball stemmer.

    Tokens are produced by `word_tokenize`; the stems are returned as one
    space-separated string.
    """
    snowball = nltk.stem.snowball.SnowballStemmer('english')
    return ' '.join(map(snowball.stem, word_tokenize(text)))

def lemma(text):
    """Lemmatize every token of `text` with NLTK's WordNet lemmatizer.

    Args:
        text: The string to process; it is split with `nltk.word_tokenize`.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    word_tokens = nltk.word_tokenize(text)
    # Join exactly once: joining an already-joined string a second time
    # would put a space between every single character.
    return " ".join(wordnet_lemmatizer.lemmatize(word) for word in word_tokens)

def sentence_word(text):
    """Return the list of tokens that `nltk.word_tokenize` produces for `text`."""
    return nltk.word_tokenize(text)

def paragraph_sentence(text):
    """Return the list of sentences that `nltk.sent_tokenize` produces for `text`."""
    return nltk.sent_tokenize(text)

def tokenize(text):
    """Split `text` into its maximal runs of word characters, in order of appearance."""
    return [match.group() for match in re.finditer(r'\w+', text)]

def remove_numbers(text):
    """Return `text` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def clean_text(text):
    """Run `text` through the full cleaning pipeline and return the result.

    The steps are applied one after another, each receiving the previous
    step's output, in this order: line breaks, one-character words, special
    characters, lower-casing, punctuation, stop words, stemming, numbers.
    """
    text = remove_line_breaks(text)
    text = remove_one_character_words(text)
    text = remove_special_characters(text)
    text = lowercase(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = stem(text)
    text = remove_numbers(text)
    return text



In [77]:
# Start the accumulator with a single space; the following cell appends every tweet to it.
text = ' '

In [78]:
# Append every entry of the 'text' column to the running string, with no
# separator between consecutive entries.
text += ''.join(df['text'])

In [82]:
# Bare last expression: the notebook echoes the repr of the concatenated string.
text

' #GPT4 for FREE. \nNo its not a clickbait, @Qolaba Studio now has a chatbot powered by #ChatGPT4 offering #ChatGPTplus for FREE to use along with #AIart generator which allows you a 1-click #NFT #minting to make the most of your #AIArtwork \n#NFTs #OpenAI #OpenAIChatGPT  #chatgptAI enthusiastChatGPT Thinks These 5 Crypto Coins Will Explode This Year - Yahoo Finance #chatgpt #AI #openAI https://t.co/tFOdU0qnDzNew: @JWVance\'s post about 5 #startups (including @VcinityInc) that won the 1st #startup50 challenge (thx to #ChatGPT), based on answers about macro-economic pressures, VC funding, track record of management team &amp; growth potential of the market sector.  https://t.co/JXG37AMjh5🚨Get Out!🚨\n💰#Binance Spot💰\n⬇ Recommendation: #Short 🔴\nTicker:  #GHSTUSDT\nTime Interval:  5min\nLast Price: 0.956\n🔴 RSI: 88.3\n\nPowered by #ChatGPT\n\n$GHST\n#GHST\n#AlphaQuarkToken\n#저스트\n\nWhat are you gonna do?\n👇LONG        👇WAIT         👇SHORT https://t.co/QIqgtT11AaCaught in the web of the di

In [83]:
# Number of characters in the concatenated string.
len(text)

21519

In [95]:
# Fetch the NLTK resources the helper functions rely on: the stop-word lists
# read through stopwords.words('english'), and the punkt tokenizer models
# (presumably what word_tokenize / sent_tokenize load; confirm).
# NOTE(review): lemma() uses WordNetLemmatizer, which likely needs the
# 'wordnet' corpus as well. It is not downloaded here; confirm.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [96]:
def DF2TXT(df, sep=' '):
  """Concatenate the 'text' column of `df` into one string.

  Args:
    df: DataFrame with a 'text' column.
    sep: String placed between consecutive entries. Defaults to a single
      space so the last word of one entry does not fuse with the first word
      of the next; pass '' to glue the entries together with nothing
      in between.

  Returns:
    A single space followed by the entries joined with `sep`. Missing values
    are skipped; a frame with no usable entries yields ' '.
  """
  # dropna(): a NaN entry is a float and cannot be concatenated to a string.
  entries = df['text'].dropna().astype(str)
  # A single join avoids rebuilding the growing string once per row.
  return ' ' + sep.join(entries)

In [97]:
def NGramAnalysis(text):
  """Print the bigrams and trigrams of `text` after stop-word removal.

  The text is split with `word_tokenize`; tokens whose lower-cased form is an
  English stop word are dropped (the surviving tokens keep their original
  casing). The members of each n-gram are joined with '_'.

  Args:
    text: The string to analyse.

  Returns:
    None. Two lists are printed: first the bigrams, then the trigrams.
  """
  word_tokens = word_tokenize(text)
  # A set makes each membership test constant-time instead of scanning the
  # whole stop-word list once per token.
  stop_words = set(stopwords.words('english'))
  clean_word_data = [w for w in word_tokens if w.lower() not in stop_words]
  bigrams_list = ["_".join(item) for item in nltk.bigrams(clean_word_data)]
  print(bigrams_list)
  trigrams_list = ["_".join(item) for item in nltk.trigrams(clean_word_data)]
  print(trigrams_list)
  

['#_GPT4', 'GPT4_FREE', 'FREE_.', '._clickbait', 'clickbait_,', ',_@', '@_Qolaba', 'Qolaba_Studio', 'Studio_chatbot', 'chatbot_powered', 'powered_#', '#_ChatGPT4', 'ChatGPT4_offering', 'offering_#', '#_ChatGPTplus', 'ChatGPTplus_FREE', 'FREE_use', 'use_along', 'along_#', '#_AIart', 'AIart_generator', 'generator_allows', 'allows_1-click', '1-click_#', '#_NFT', 'NFT_#', '#_minting', 'minting_make', 'make_#', '#_AIArtwork', 'AIArtwork_#', '#_NFTs', 'NFTs_#', '#_OpenAI', 'OpenAI_#', '#_OpenAIChatGPT', 'OpenAIChatGPT_#', '#_chatgptAI', 'chatgptAI_enthusiastChatGPT', 'enthusiastChatGPT_Thinks', 'Thinks_5', '5_Crypto', 'Crypto_Coins', 'Coins_Explode', 'Explode_Year', 'Year_-', '-_Yahoo', 'Yahoo_Finance', 'Finance_#', '#_chatgpt', 'chatgpt_#', '#_AI', 'AI_#', '#_openAI', 'openAI_https', 'https_:', ':_//t.co/tFOdU0qnDzNew', '//t.co/tFOdU0qnDzNew_:', ':_@', '@_JWVance', "JWVance_'s", "'s_post", 'post_5', '5_#', '#_startups', 'startups_(', '(_including', 'including_@', '@_VcinityInc', 'VcinityInc