<a href="https://colab.research.google.com/github/ElhamHosseini73/LearnNLP/blob/main/Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [69]:
# you will be implementing logistic regression for sentiment analysis on tweets.
# Given a tweet, you will decide if it has a positive sentiment or a negative one

#step:
#Learn how to extract features for logistic regression given some text
#Implement logistic regression from scratch
#Apply logistic regression on a natural language processing task
#Test using your logistic regression
#Perform error analysis

In [None]:
import nltk
nltk.download('twitter_samples')
from nltk.corpus import twitter_samples

In [71]:

all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

data = all_positive_tweets +all_negative_tweets

lable_positive = [1]* len(all_positive_tweets)
lable_negative = [0]* len(all_negative_tweets)

lables = lable_positive + lable_negative

In [72]:
#Train test split: 20% will be in the test set, and 80% in the training set.
len_pos_data_train = int(len(all_positive_tweets)*0.8)
len_neg_data_train = int(len(all_negative_tweets)*0.8)

x_train = all_positive_tweets[0:len_pos_data_train] + all_negative_tweets[0:len_neg_data_train]
x_test =  all_positive_tweets[len_pos_data_train:] + all_negative_tweets[len_neg_data_train:]

lable_train = lable_positive[0:len_pos_data_train]+lable_negative[0:len_pos_data_train]
lable_test =  lable_positive[len_pos_data_train:]+lable_negative[len_pos_data_train:]

In [None]:
#preprocessing twitter

import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import string
from nltk.stem import PorterStemmer

stopwords_english = stopwords.words('english')

tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)


# Instantiate stemming class
stemmer = PorterStemmer()

def preprocessing_twitter(text):
  # remove old style retweet text "RT"
  text = re.sub(r'^RT[\s]+', '', text)

  # remove hyperlinks
  text = re.sub(r'https?://[^\s\n\r]+', '', text)

  # remove hashtags
  # only removing the hash # sign from the word
  text = re.sub(r'#', '', text)

  # tokenize tweets
  text_tokens = tokenizer.tokenize(text)

  text_clean = []

  for word in text_tokens: # Go through every word in your tokens list
    if (word not in stopwords_english and  # remove stopwords
        word not in string.punctuation):  # remove punctuation
        text_clean.append(word)

  text_stem = []

  for word in text_clean:
    stem_word = stemmer.stem(word)  # stemming word
    text_stem.append(stem_word)  # append to the list

  return text_stem



In [74]:

# this function will build a dictionary where we can lookup how many times a word appears in the lists of positive or negative tweets.

def build_freqs(tweets, lables):

  #   Input: tweets: a list of tweets && lables: an m x 1 array with the sentiment label of each tweet (either 0 or 1)
  #   Output: freqs: a dictionary mapping each (word, lable) pair to its frequency

  word_freq = {}

  for index , tweet in enumerate(tweets):
    process_tweet = preprocessing_twitter(tweet)

    for token in process_tweet:

      if (token,lables[index]) in word_freq:
        word_freq[token,lables[index]] = word_freq[token,lables[index]]+1
      else:
        word_freq[token,lables[index]] = 1

  return word_freq

In [75]:
word_freq = build_freqs(data,lables)

In [76]:
import numpy as np

#Extracting the features
#The first feature is the number of positive words in a tweet.
#The second feature is the number of negative words in a tweet.
#Then train your logistic regression classifier on these features.

def extract_feature(data):

  feature_extraction = np.zeros((len(data),3))

  for index ,text in enumerate(data) :
    rocess_tweet = preprocessing_twitter(text)

    #Delete the duplicate item in the list
    punique_list = list(set(rocess_tweet))

    sum_positive = 0
    sum_negetive = 0

    for word in punique_list:
      if (word,1) in word_freq:
        sum_positive = sum_positive+word_freq[(word,1)]
      if (word,0) in word_freq:
        sum_negetive = sum_negetive+word_freq[(word,0)]

    feature_extraction[index,:] = [1,sum_positive,sum_negetive]

  return feature_extraction




In [77]:
X=extract_feature(data)
print(X)

[[1.000e+00 3.887e+03 7.200e+01]
 [1.000e+00 4.613e+03 5.180e+02]
 [1.000e+00 3.883e+03 1.610e+02]
 ...
 [1.000e+00 3.400e+01 4.653e+03]
 [1.000e+00 1.000e+01 4.598e+03]
 [1.000e+00 1.160e+02 5.900e+02]]


In [78]:
Xpositive= X[0:len(all_positive_tweets)]
Xnegetive = X[len(all_positive_tweets):]

Xtrain = np.concatenate(( Xpositive[0:int(len(Xpositive)*0.8)], Xnegetive[0:int(len(Xnegetive)*0.8)]))
Xtest =  np.concatenate(( Xpositive[int(len(Xpositive)*0.8):], Xnegetive[int(len(Xpositive)*0.8):]))

In [79]:
def sigmoid(Z):
  result = 1/(1+ np.exp(-Z))
  return result

In [103]:
def cost(h,y):

  m = len(y)
  y = np.array([y])

  result_cost = -1/m*( np.dot(y,np.log(h)) +  np.dot((1-y),np.log(1-h)))

  return result_cost



In [120]:
def update_weight(X,Y,h,alpha,theta):
  m = len(Y)
  y = np.array([Y])

  geradian = -alpha/m*(np.dot((h-y.T).T,X))
  theta = theta - geradian

  print(theta)
  return 0

In [121]:
#train
def train(X,Y,initial_theta,):

  z = np.dot(X, initial_theta)

  h = sigmoid(z)
  result_cost = cost(h,Y)

  update_weight(X,Y,h,1e-9,initial_theta)



In [122]:
train(Xtrain,lable_train,np.zeros((3,1)))

[[ 0.00000000e+00 -7.37977500e-07  1.03907331e-06]
 [ 0.00000000e+00 -7.37977500e-07  1.03907331e-06]
 [ 0.00000000e+00 -7.37977500e-07  1.03907331e-06]]


In [None]:
import numpy as np

# Define two matrices
A = np.array([[1, 2]])
B = np.array([[5, 6], [7, 8]])

print(A)
# Calculate the dot product
result = np.dot(A, B)

print(result)


[[1 2]]
[[19 22]]
