<a href="https://colab.research.google.com/github/AK-Github-0/NLP-Lab-Final/blob/main/2020024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing Libraries and Modules

In [144]:
import re
import string
import warnings
from collections import Counter

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as pyo
import seaborn as sns
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from plotly.subplots import make_subplots
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from textblob import TextBlob
from tqdm.auto import tqdm
from wordcloud import WordCloud
from wordcloud import WordCloud, STOPWORDS
# NLTK resources used by the helpers in this notebook:
#   stopwords -> stopwords.words('english')
#   punkt     -> word_tokenize / sent_tokenize
#   wordnet   -> WordNetLemmatizer (used by lemma())
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Defining all functions

In [50]:


def load_dataset(path, max_rows=100):
  """Read a CSV file into a DataFrame, keeping only its first rows.

  Args:
    path: Anything ``pd.read_csv`` accepts as its first argument.
    max_rows: Maximum number of data rows to load. Defaults to 100, the
      sample size this function previously hard-coded. Pass ``None`` to
      load every row.

  Returns:
    A ``pandas.DataFrame`` holding at most ``max_rows`` rows.
  """
  # Let the parser stop early rather than reading the whole file and slicing.
  return pd.read_csv(path, nrows=max_rows)



In [None]:


def _hashtag_counts(frame, limit=30):
  """Return the ``limit`` most frequent entries of ``frame['hashtags']`` as a two-column frame."""
  tags = []
  for raw in frame['hashtags']:
    # Rows without hashtags hold NaN (a float), which has no .strip().
    if not isinstance(raw, str):
      continue
    # Each cell is the text of a list, e.g. "['a', 'b']": drop the brackets, split on ", ".
    tags.extend(tag for tag in raw.strip('][').split(', ') if tag)
  counts = pd.Series(tags, dtype='object').value_counts()[:limit]
  return counts.rename_axis('common hashtags').reset_index(name='count')


def _source_bar_figure(frame, title, height, limit=10):
  """Build a bar chart of the ``limit`` most frequent values of ``frame['source']``."""
  counts = frame['source'].value_counts()[:limit]
  # Labels come from the data itself so every bar is paired with its own source name.
  trace = go.Bar(x=counts.index.tolist(), y=counts.values,
                 marker=dict(color='rgb(250,13,92)', line=dict(color='rgb(0,0,0)', width=1.5)),
                 text=counts.values, textposition='outside')
  layout = go.Layout(template='plotly_dark', title=title, xaxis=dict(title='Source'),
                     yaxis=dict(title='Count'), height=height)
  return go.Figure(data=[trace], layout=layout)


def EDA(df):
  """Write the exploratory charts for the tweet DataFrame to image files.

  Reads the ``user_verified``, ``source`` and ``hashtags`` columns of ``df``
  and saves, in the working directory:

  * ``user_verifiedornot.png`` - histogram of ``user_verified``
  * ``Source_distribution.jpeg`` - most frequent tweet sources
  * ``Common_hashtags_by_verified.jpeg`` / ``Common_hashtags_by_unverified.jpeg``
    - treemaps of the 30 most common hashtags per account type
  * ``Source_distributions_from_verified.jpeg`` - most frequent sources among
    verified accounts

  Args:
    df: DataFrame with the columns listed above.

  Returns:
    None. The function is used for its file-writing side effects.
  """
  plt.figure(figsize = (12,6))
  sns.histplot(df['user_verified'])
  plt.title('Account Distribution',fontsize = 20)
  plt.savefig('user_verifiedornot.png')

  fig = _source_bar_figure(df, 'Top 10 Most Source Disrtibution Of Tweets', 700)
  fig.write_image("Source_distribution.jpeg")

  data_verified = df[df['user_verified']==True]
  data_not_verified = df[df['user_verified']==False]

  fig = px.treemap(_hashtag_counts(data_verified), path=['common hashtags'], values='count',
                   title='30 Most common hashtags by Verified Accounts')
  fig.write_image("Common_hashtags_by_verified.jpeg")

  # Each subset is iterated over its own rows; the unverified chart used to be
  # driven by the length of the verified subset.
  fig = px.treemap(_hashtag_counts(data_not_verified), path=['common hashtags'], values='count',
                   title='30 Most common hashtags by unverified Accounts')
  fig.write_image("Common_hashtags_by_unverified.jpeg")

  # NOTE(review): the title says "Top 20" but only ten bars were ever drawn; confirm the intended count.
  fig = _source_bar_figure(data_verified,
                           'Top 20 Most Source Distribution of Tweets From Verified Accounts', 650)
  fig.write_image("Source_distributions_from_verified.jpeg")



In [126]:

def remove_line_breaks(text):
    """Return ``text`` with every carriage return and newline replaced by a space."""
    for line_break in ('\r', '\n'):
        text = text.replace(line_break, ' ')
    return text


def remove_punctuation(text):
    """Tokenize ``text`` and replace punctuation inside each token with a space.

    Tokens that start with a placeholder such as ``__NAME__`` or ``__LINK__``
    are passed through unchanged. The processed tokens are joined with single
    spaces.
    """
    placeholder_pattern = re.compile("__[A-Z]+__")
    # re.escape neutralises every punctuation character inside the character class.
    punctuation_pattern = re.compile("[%s]" % re.escape(string.punctuation))

    def _scrub(token):
        if placeholder_pattern.match(token):
            return token
        return punctuation_pattern.sub(" ", token)

    return ' '.join(_scrub(token) for token in word_tokenize(text))


def remove_special_characters(text):
    """Delete every character that is not an ASCII letter, a digit or whitespace.

    Args:
        text: Input string.

    Returns:
        ``text`` with all other characters removed.
    """
    # The range must be A-Z: the former "A-z" also spans [ \ ] ^ _ ` and kept them.
    # Raw string so that \s reaches the regex engine unchanged.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)


def lowercase(text):
    """Tokenize ``text`` and return the lower-cased tokens joined by single spaces."""
    return ' '.join(map(str.lower, word_tokenize(text)))


def remove_stopwords(text):
    """Tokenize ``text`` and drop tokens found in NLTK's English stop-word list.

    Membership is an exact string comparison, so it is case-sensitive. The
    remaining tokens are joined with single spaces.
    """
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token in english_stopwords:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)


def remove_one_character_words(text):
    """Tokenize ``text`` and keep only tokens longer than one character."""
    long_enough = filter(lambda token: len(token) > 1, word_tokenize(text))
    return ' '.join(long_enough)
    

def stem(text):
    """Tokenize ``text`` and return the English Snowball stem of each token, space-joined."""
    english_stemmer = nltk.stem.snowball.SnowballStemmer('english')
    return ' '.join(map(english_stemmer.stem, word_tokenize(text)))

def lemma(text):
    """Tokenize ``text`` and return the WordNet lemma of each token, space-joined.

    Args:
        text: Input string.

    Returns:
        A single string of lemmatized tokens separated by spaces.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text)]
    # Join exactly once: joining the already-joined string again would insert
    # a space between every character.
    return ' '.join(lemmas)

def sentence_word(text):
    """Return the word tokens that ``nltk.word_tokenize`` produces for ``text``."""
    return nltk.word_tokenize(text)

def paragraph_sentence(text):
    """Return the sentences that ``nltk.sent_tokenize`` produces for ``text``."""
    return nltk.sent_tokenize(text)

def tokenize(text):
    """Return, in order, every maximal run of word characters found in ``text``."""
    word_pattern = re.compile(r'\w+')
    return word_pattern.findall(text)

def remove_numbers(text):
    """Return ``text`` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def clean_text(text):
    """Run ``text`` through the full cleaning pipeline and return the result.

    The stages run in this fixed order, each receiving the previous stage's
    output: line breaks, one-character words, special characters, lower-casing,
    punctuation, stop words, stemming, digits.
    """
    pipeline = (
        remove_line_breaks,
        remove_one_character_words,
        remove_special_characters,
        lowercase,
        remove_punctuation,
        remove_stopwords,
        stem,
        remove_numbers,
    )
    cleaned = text
    for transform in pipeline:
        cleaned = transform(cleaned)
    return cleaned



In [96]:
def DF2TXT(df):
  """Join every entry of ``df['text']`` into one space-separated string.

  A space is placed between consecutive entries so that the last word of one
  tweet and the first word of the next stay separate tokens. Missing values
  are skipped.

  Args:
    df: DataFrame with a ``text`` column.

  Returns:
    The concatenated text; an empty string when there is nothing to join.
  """
  # str.join builds the result in one pass instead of repeated string concatenation.
  return ' '.join(str(tweet) for tweet in df['text'].dropna())

In [119]:
def NGramAnalysis(text):
  """Plot the ten most common bigrams and trigrams of ``text``.

  The text is tokenized, English stop words are dropped (case-insensitively),
  and the n-grams of the remaining tokens are counted. Two horizontal bar
  charts are drawn side by side and saved to ``TrigramsBigrams.png``.

  Args:
    text: The text to analyse, as a single string.

  Returns:
    None. The function is used for its plotting / file-writing side effects.
  """
  # A set gives constant-time membership tests for every token.
  stop_words = set(stopwords.words('english'))
  tokens = [w for w in word_tokenize(text) if w.lower() not in stop_words]

  panels = (
    (121, nltk.bigrams(tokens), 'Bigrams'),
    (122, nltk.trigrams(tokens), 'Trigrams'),
  )

  plt.figure(figsize=(10, 5))
  for position, ngrams, kind in panels:
    top_10 = Counter("_".join(item) for item in ngrams).most_common(10)
    # zip(*[]) yields nothing to unpack, so very short texts get an empty panel.
    labels, values = zip(*top_10) if top_10 else ((), ())

    plt.subplot(position)
    plt.barh(range(len(labels)), values, align='center')
    plt.yticks(range(len(labels)), labels)
    plt.xlabel('Count')
    plt.ylabel(kind)
    plt.title('Top 10 Most Common %s' % kind)

  plt.tight_layout()
  plt.savefig('TrigramsBigrams.png')


def show_wordcloud(df):
  """Render a word cloud of all tweet text and save it to ``WordCloud.png``.

  Args:
    df: DataFrame with a ``text`` column.

  Returns:
    None. The function is used for its plotting / file-writing side effects.
  """
  # Use every word of every tweet. Taking only split()[1] fed the cloud the
  # second word of each tweet and raised IndexError on one-word tweets.
  text = " ".join(str(tweet) for tweet in df.text.dropna())

  word_cloud = WordCloud(collocations = False, background_color = 'white').generate(text)

  # Start a fresh figure so the cloud is not drawn into axes left over from an earlier plot.
  plt.figure()
  plt.imshow(word_cloud, interpolation='bilinear')
  plt.axis("off")
  plt.savefig('WordCloud.png')

In [130]:
def get_textblob_sentiment(text):
    """Label ``text`` by the sign of its TextBlob polarity.

    Returns ``'Positive'`` for polarity above zero, ``'Negative'`` for polarity
    below zero and ``'Neutral'`` otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'

In [135]:
def sentimentanalysis(df):
  """Add cleaned text and a sentiment label to ``df`` in place.

  Writes two columns: ``preprocessed_text`` (``clean_text`` applied to
  ``text``) and ``textblob_sentiment`` (``get_textblob_sentiment`` applied to
  the cleaned text). Nothing is returned.
  """
  cleaned_tweets = df['text'].apply(clean_text)
  df['preprocessed_text'] = cleaned_tweets
  sentiment_labels = df['preprocessed_text'].apply(get_textblob_sentiment)
  df['textblob_sentiment'] = sentiment_labels


In [141]:


def Feature_Engineering(df):
  """Add simple length features derived from ``df['text']`` and return ``df``.

  New columns (written in place):
    sentencelen  - number of pieces obtained by splitting on '.'
    countofwords - number of whitespace-separated words
    spaces       - number of space characters
    characters   - total string length
  """
  tweets = df['text']
  feature_functions = (
    ('sentencelen', lambda x: len(x.split('.'))),
    ('countofwords', lambda x: len(x.split())),
    ('spaces', lambda x: x.count(' ')),
    ('characters', len),
  )
  for column_name, measure in feature_functions:
    df[column_name] = tweets.apply(measure)
  return df

In [147]:


def vectorization(df):
  """Add bag-of-words, TF-IDF and Word2Vec representations of ``df['text']``.

  New columns (written in place):
    CountVect_df - dense ``CountVectorizer`` row for each text, as a list
    TfIDF_df     - dense ``TfidfVectorizer`` row for each text, as a list
    Word2Vec_df  - sum of the Word2Vec vectors of the text's whitespace tokens

  Args:
    df: DataFrame with a ``text`` column of strings.

  Returns:
    The same DataFrame, with the three columns added.
  """
  texts = df['text']
  tokenized = [tweet.split() for tweet in texts]

  vector_size = 100
  # gensim 4.0 renamed the embedding-dimension argument from `size` to `vector_size`.
  size_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
  w2v_model = Word2Vec(tokenized, window=5, min_count=1, workers=4, **{size_kwarg: vector_size})

  df['CountVect_df'] = CountVectorizer().fit_transform(texts).toarray().tolist()
  df['TfIDF_df'] = TfidfVectorizer().fit_transform(texts).toarray().tolist()

  def _sum_vectors(tokens):
    vectors = [w2v_model.wv[token] for token in tokens if token in w2v_model.wv]
    # A text with no known tokens gets a zero vector, not the int 0 that sum([]) returns.
    return sum(vectors) if vectors else np.zeros(vector_size, dtype=np.float32)

  df['Word2Vec_df'] = [_sum_vectors(tokens) for tokens in tokenized]
  return df

In [None]:
def model(df):
  """Train several classifiers on TF-IDF features and report test accuracy.

  Uses ``df['preprocessed_text']`` as input and ``df['textblob_sentiment']``
  as the label, with a 70/30 train/test split (``random_state=42``).

  Args:
    df: DataFrame containing both columns (as written by ``sentimentanalysis``).

  Returns:
    A dict mapping each model's display name to its accuracy on the test split.
    Previously the function built the model list and returned ``None`` without
    fitting anything.
  """
  # Split the data into training and testing sets
  X_train, X_test, y_train, y_test = train_test_split(
    df['preprocessed_text'], df['textblob_sentiment'], test_size=0.3, random_state=42)

  # Vectorize the text data using TF-IDF; the vocabulary is fitted on the training split only
  tfidf_vectorizer = TfidfVectorizer(stop_words='english')
  X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
  X_test_tfidf = tfidf_vectorizer.transform(X_test)

  models = [
    ("LinearSVC", LinearSVC()),
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("MultinomialNB", MultinomialNB()),
    ("Random Forest", RandomForestClassifier(n_estimators=100)),
    ("Decision Tree", DecisionTreeClassifier())
  ]

  results = {}
  for name, classifier in models:
    classifier.fit(X_train_tfidf, y_train)
    results[name] = accuracy_score(y_test, classifier.predict(X_test_tfidf))
  return results

In [None]:

def extract_features_countvect_textblob(text):
    """Pair ``text`` with its TextBlob polarity and return ``[text, polarity]``."""
    return [text, TextBlob(text).sentiment.polarity]

# Pair every cleaned tweet with its TextBlob polarity.
df['features'] = df['preprocessed_text'].apply(extract_features_countvect_textblob)

# The labels live in 'textblob_sentiment' (written by sentimentanalysis);
# no 'sentiment' column is created anywhere in this notebook.
X_train, X_test, y_train, y_test = train_test_split(df['features'], df['textblob_sentiment'], test_size=0.2, random_state=42)

# Only the text (element 0 of each [text, polarity] pair) is vectorized.
train_texts = X_train.apply(lambda x: x[0])
test_texts = X_test.apply(lambda x: x[0])

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_texts)
X_test_counts = count_vect.transform(test_texts)

tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(train_texts)
X_test_tfidf = tfidf_vect.transform(test_texts)

svc_model = LinearSVC()

# Fit and score the same classifier on each feature representation in turn.
for heading, train_matrix, test_matrix in (
    ("Results for CountVectorizer features:", X_train_counts, X_test_counts),
    ("\nResults for TF-IDF features:", X_train_tfidf, X_test_tfidf),
):
    svc_model.fit(train_matrix, y_train)
    y_pred = svc_model.predict(test_matrix)
    print(heading)
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")
    print(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")
    print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted')}")

### Main Program

In [136]:
# Adds the 'preprocessed_text' and 'textblob_sentiment' columns to df in place.
sentimentanalysis(df)

In [138]:
# Preview the first rows of df.
df.head()

Unnamed: 0,user_name,text,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,hashtags,source,textblob_sentiment,preprocessed_text
0,Bohmle,"#GPT4 for FREE. \nNo its not a clickbait, @Qol...",Carkingga,,,,,,,,,,Positive,gpt free clickbait qolaba studio chatbot power...
1,,AI enthusiast,2019-07-03 03:44:41+00:00,60.0,349.0,611.0,False,2023-05-17 18:11:12+00:00,False,Twitter Web App,,,Neutral,ai enthusiast
2,Dan Bruno AI,ChatGPT Thinks These 5 Crypto Coins Will Explo...,"Manchester, NH","The latest in #ChatGPT, #BARD, #Bing, and othe...",2021-05-19 01:19:32+00:00,470.0,157.0,5185,False,2023-05-17 18:11:03+00:00,"['chatgpt', 'AI', 'openAI']",dlvr.it,Neutral,chatgpt think crypto coin explod year yahoo fi...
3,Georgiana Comsa,New: @JWVance's post about 5 #startups (includ...,Palo Alto,"Founder of Silicon Valley PR, award-winning PR...",2008-12-24 09:32:23+00:00,3864.0,1883.0,2415,False,2023-05-17 18:10:25+00:00,"['startups', 'startup50']",Twitter Web App,Negative,new jwvanc post startup includ vcinityinc st s...
4,Bitone Great,🚨Get Out!🚨\n💰#Binance Spot💰\n⬇ Recommendation:...,Hong Kong,#ChatGPT (AI) powered Free Trading Signal! \nL...,2022-11-21 04:42:18+00:00,1517.0,506.0,64,False,2023-05-17 18:09:39+00:00,"['Binance', 'Short', 'GHSTUSDT']",rsi1,Negative,get binanc spot recommend short ticker ghstusd...
