<a href="https://colab.research.google.com/github/Ahmed-Shatla/NLP-data-cleaning/blob/main/Cleaning_tweets_with_applying_LogisticRegression%20(NLP)%20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk

#downloading

In [2]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

#Reading data

In [3]:
from nltk.corpus import twitter_samples
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')

#data cleaning

In [4]:
import re

In [5]:
def cleanText(text):
  text= re.sub(r'#[\S|_]*','',text) #hashtage removing
  text= re.sub(r'@[\S|_]*','',text) #username removing
  text= re.sub(r'https?:\/\/\S+','',text) #hyperlink removing
  text = re.sub(r'\W',' ',text) #remove any emotions
  text = re.sub(r'\d+','',text) #remove any standalone digits
  text = re.sub(r'^\s+','',text)#remove spaces that at start of the sentences
  text = re.sub(r'\s+$','',text)#remove spaces that at end of the sentences

  return text

In [6]:
cleanText(pos_tweets[0])

'for being top engaged members in my community this week'

In [7]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from nltk.stem import PorterStemmer
def process_on_tweets(tweets):
  result=[]
  for tweet in tweets:
    tweet = cleanText(tweet)
    tweet = tweet.split()
    tweet=[word for word in tweet if word.lower() not in stop_words]

    ps = PorterStemmer()
    tweet=[ps.stem(word) for word in tweet]

    result.append(tweet)
  return result

In [9]:
positive_tweets = process_on_tweets(pos_tweets)
negative_tweets = process_on_tweets(neg_tweets)

#Frequinces
### (word,label):freq

In [10]:
def build_freq(tweets,label):
  freq ={}
  for tweet in tweets:
    for word in tweet:
      key = (word,label)
      if key in freq:
        freq[key]+=1
      else:
        freq[key]=1

  return freq

In [11]:
pos_freq = build_freq(positive_tweets,1)
neg_freq = build_freq(negative_tweets,0)

#building features

In [12]:
def build_features(tweets,p_freq,n_freq):
  features=[]
  temp_vec = []

  for tweet in tweets:
    temp_vec=[0]*3   #[0,0,0]
    p_count = 0
    n_count = 0
    temp_vec[0]=1   #bias = 1
    for word in tweet:
      p_count+=p_freq.get((word,1),1)
      n_count+=n_freq.get((word,0),0)

    temp_vec[1]=p_count
    temp_vec[2]=n_count


    features.append(temp_vec)
  return features

In [13]:
pos_features = build_features(positive_tweets,pos_freq,neg_freq)
neg_features = build_features(negative_tweets,pos_freq,neg_freq)

In [14]:
pos_features[0]

[1, 173, 73]

In [15]:
len(pos_features)

5000

In [16]:
import numpy as np
pos_y = np.ones((len(pos_features),1))
neg_y = np.zeros((len(neg_features),1))

In [17]:
X=[]
y = []
for i in range(len(pos_features)):
  X.append(pos_features[i])
  X.append(neg_features[i])
  y.append(pos_y[i][0])
  y.append(neg_y[i][0])


In [18]:
import pandas as pd
data = pd.DataFrame(data=X,columns=['Bias','Feature 1','Feature 2'])
data['Label'] = y
data.head()

Unnamed: 0,Bias,Feature 1,Feature 2,Label
0,1,173,73,1.0
1,1,2,5,0.0
2,1,918,505,1.0
3,1,76,130,0.0
4,1,194,159,1.0


In [19]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(data.iloc[:,:-1].values,data.iloc[:,-1].values)

In [20]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train,y_train)

LogisticRegression()

In [21]:
y_pred = model.predict(X_test)

In [22]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_pred,y_test))

0.7636
