<a href="https://colab.research.google.com/github/AhmedZeer/ml.py/blob/master/GC__SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pyprind

Collecting pyprind
  Downloading PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB)
Installing collected packages: pyprind
Successfully installed pyprind-2.11.3


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pyprind
import pandas as pd
import os
import sys

## Preparing The Data

In [None]:
df = pd.DataFrame()

In [None]:
basepath = "../../../assets/aclImdb/"
labels = { "pos" : 1, "neg" : 0 }
pbar = pyprind.ProgBar( 50000, stream=sys.stdout )
df = pd.DataFrame()

for s in ( "test", "train" ):
    for l in ( "pos", "neg" ):
        pth = os.path.join(basepath,s,l)
        for file in sorted(os.listdir(pth)):
            with open(os.path.join(pth,file), 'r', encoding="utf-8" ) as infile:
                txt = infile.read()
                tmp = pd.DataFrame( [ {txt, labels[l]} ] )
                df  = pd.concat( [ df, tmp ], ignore_index = True )
                pbar.update()



In [None]:
import numpy as np
df.columns = ["label", "sentiment"]
np.random.seed(0)
df = df.reindex( np.random.permutation( df.index ) )

In [None]:
df.to_csv("movie_data.csv", index = False, encoding="utf-8")

In [None]:
import pandas as pd

In [6]:
df = pd.read_csv("/content/drive/MyDrive/movie_data.csv")

In [None]:
path = os.path.join( basepath, "test", "pos" )

## Bag Of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
count = CountVectorizer()

In [None]:
sentences = np.array( ['Hello <repeated> this is a text',
                      'Another different <repeated> <repeated> <repeated> text',
                      'what a <repeated> corpus <repeated> you got !'])

In [None]:
bag = count.fit_transform( sentences )

In [None]:
count.vocabulary_

{'hello': 4,
 'repeated': 6,
 'this': 8,
 'is': 5,
 'text': 7,
 'another': 0,
 'different': 2,
 'what': 9,
 'corpus': 1,
 'you': 10,
 'got': 3}

In [None]:
'''
    Each row represents a document, and the values in these vectors stands for
    raw term frequencies.
'''

In [None]:
bag.toarray()

array([[0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0],
       [1, 0, 1, 0, 0, 0, 3, 1, 0, 0, 0],
       [0, 1, 0, 1, 0, 0, 2, 0, 0, 1, 1]])

## Term frequency-inverse document frequency ( tf-idf )

In [None]:
'''
    * When we try to classify documents from both classes we encounter words that
    don't belong to any of these classes. We tend to exclude these words from
    our data.

    * the tf-idf can be identified as the product of `term frequncy` and `inverse
    document frequency`

'''

$$\Large{idf( t, d ) = \log{\frac{n_d}{1+df(d,t)}}}$$

* $\large{n_d}: \small{\text{Total Number Of Docs}}$
* ${df(d,t)}: \small{\text{The number of total documents, d, that contains the term, t.}}$
* The `1` is used to elimnate the possibililty of zero denominator.
* The `log` is used to ensure that low document frequency does not gain too much weight.

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer( use_idf = True, norm = 'l2', smooth_idf=True )

In [None]:
np.set_printoptions( precision=2 )
print( tfidf.fit_transform( count.fit_transform( sentences ) ).toarray() )

[[0.   0.   0.   0.   0.5  0.5  0.3  0.38 0.5  0.   0.  ]
 [0.42 0.   0.42 0.   0.   0.   0.74 0.32 0.   0.   0.  ]
 [0.   0.43 0.   0.43 0.   0.   0.51 0.   0.   0.43 0.43]]


## Using Regex to Clean Text Data

In [None]:
import re

In [None]:
def preprocessor(text):
    text = re.sub( r"<[^>]*>", " ", text )
    emoticosn = re.findall( r"(?::|;|=)(?:-)?(?:\(|\)|P|D)", text )
    text = re.sub( r"[\W]+", " ", text.lower() )
    return (text + ' '.join(emoticosn))

In [None]:
emoticosn = re.findall( r"(?::|;|=)(?:-)?(?:\(|\)|P|D)", "asdijasd :( =) :)")

In [None]:
preprocessor(":)")

' :)'

In [None]:
df.loc[0, 'sentiment']

"at a Saturday matinee in my home town. I went with an older friend (he was about 12) and my mom let me go because she thought the film would be OK (it's rated G). I was assaulted by loud music, STRANGE images, no plot and a stubborn refusal to make ANY sense. We left halfway through because we were bored, frustrated and our ears hurt. <br /><br />I saw it 22 years later in a revival theatre. My opinion had changed--it's even WORSE! Basically everything I hated about it was still there and the film was VERY 60s...and has dated badly. I got all the little in-jokes...too bad they weren't funny. The constant shifts in tone got quickly annoying and there's absolutely nothing to get a firm grip on. Some people will love this. I found it frustrating...by the end of the film I felt like throwing something heavy at the screen.<br /><br />Also, all the Monkees songs in this movie SUCK (and I DO like them).<br /><br />For ex-hippies only...or if you're stoned. I give this a 1."

In [None]:
preprocessor(df.loc[0, 'sentiment'])

'at a saturday matinee in my home town i went with an older friend he was about 12 and my mom let me go because she thought the film would be ok it s rated g i was assaulted by loud music strange images no plot and a stubborn refusal to make any sense we left halfway through because we were bored frustrated and our ears hurt i saw it 22 years later in a revival theatre my opinion had changed it s even worse basically everything i hated about it was still there and the film was very 60s and has dated badly i got all the little in jokes too bad they weren t funny the constant shifts in tone got quickly annoying and there s absolutely nothing to get a firm grip on some people will love this i found it frustrating by the end of the film i felt like throwing something heavy at the screen also all the monkees songs in this movie suck and i do like them for ex hippies only or if you re stoned i give this a 1 '

In [None]:
df['sentiment'].apply(preprocessor)

0        at a saturday matinee in my home town i went w...
1        i love this movie it is the first film master ...
2        in the voice over which begins the film hughie...
3         spoiler alert the point is though that i didn...
4        this is an excellent film no it s not mel gibs...
                               ...                        
49995    although the director tried the filming was ma...
49996    it has been about 50 years since a movie has b...
49997     bar hopping seems to be trying to be about th...
49998    this awful effort just goes to show what happe...
49999    yes why among the filmmakers that came out in ...
Name: sentiment, Length: 50000, dtype: object

## Tokenizing

In [None]:
def tokenizer(text):
    return( text.split() )

In [None]:
tokenizer("split me !")

['split', 'me', '!']

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
porter = PorterStemmer()

In [None]:
def tokenizer_porter( text ):
    return [ porter.stem(word) for word in text.split() ]

In [None]:
tokenizer_porter("Testing how stemmers work they are pretty good ")

['test', 'how', 'stemmer', 'work', 'they', 'are', 'pretti', 'good']

## Stop-Word Removal

In [None]:
import nltk

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ahmed4/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords

In [None]:
stop = stopwords.words("english")

In [None]:
text = "He was pretty good in describing his feelings about this scene. However, I think it was really awkward ..."

In [None]:
[w for w in tokenizer_porter( text ) if w not in stop ]

['wa',
 'pretti',
 'good',
 'describ',
 'hi',
 'feel',
 'thi',
 'scene.',
 'however,',
 'think',
 'wa',
 'realli',
 'awkward',
 '...']

## Training a logistic clf.

In [None]:
X_train = df.loc[25000:, 'sentiment'].values
X_test = df.loc[:25000, 'sentiment'].values
y_train = df.loc[25000:, 'label'].values
y_test = df.loc[:25000, 'label'].values

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

In [None]:
tfidf.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': False,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [None]:
small_grid_param = [
    {
        'vect__ngram_range':[(1,1)],
        'vect__stop_words' :[None],
        'vect__tokenizer'  :[tokenizer, tokenizer_porter],
        'clf__penalty'     :['l2'],
        'clf__C'           :[1.0,10.0]
    },
    {
        'vect__ngram_range':[(1,1)],
        'vect__stop_words' :[stop, None],
        'vect__tokenizer'  :[tokenizer],
        'vect__norm'       :[None],
        'vect__use_idf'    :[False],
        'clf__penalty'     :['l2'],
        'clf__C'           :[1.0,10.0]
    }
]

In [None]:
lr_tfidf = Pipeline( [ ('vect', tfidf), ("clf", LogisticRegression( solver="liblinear" ) ) ] )

In [None]:
gs_lr_tfidf = GridSearchCV(lr_tfidf, small_grid_param, scoring='accuracy', cv=5, verbose=2, n_jobs=-1 )

In [None]:
gs_lr_tfidf.fit(X_train, y_train )

## Out Of Core Learning

In [7]:
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
stop = stopwords.words("english")

In [55]:
text = df.loc[0,"sentiment"]

In [53]:
def tokenizer(text):
  icons_pattern = "(?::|;|=)+(?:-|=)?(?:\(|\)|P|D)+"
  text = re.sub("<[^>]*>",'',text)
  emoticons = re.findall(icons_pattern,text)
  text = re.sub('[\W]+',' ',text)
  text = text.lower()
  text = text + ' '+ ''.join(emoticons)
  ret = [w for w in text.split() if w not in stop]
  return ret

In [54]:
tokenizer("blah blah :) <whatever> placeholer")

['blah', 'blah', 'placeholer', ':)']

In [174]:
def stream_docs( path ):
  df_tmp = pd.read_csv(path)
  for idx, (label,data) in df_tmp.iterrows():
    yield (int(label), data)

In [175]:
next(stream_docs("/content/drive/MyDrive/movie_data.csv"))

(0,
 "at a Saturday matinee in my home town. I went with an older friend (he was about 12) and my mom let me go because she thought the film would be OK (it's rated G). I was assaulted by loud music, STRANGE images, no plot and a stubborn refusal to make ANY sense. We left halfway through because we were bored, frustrated and our ears hurt. <br /><br />I saw it 22 years later in a revival theatre. My opinion had changed--it's even WORSE! Basically everything I hated about it was still there and the film was VERY 60s...and has dated badly. I got all the little in-jokes...too bad they weren't funny. The constant shifts in tone got quickly annoying and there's absolutely nothing to get a firm grip on. Some people will love this. I found it frustrating...by the end of the film I felt like throwing something heavy at the screen.<br /><br />Also, all the Monkees songs in this movie SUCK (and I DO like them).<br /><br />For ex-hippies only...or if you're stoned. I give this a 1.")

In [176]:
def mini_batch(docs, batch_size):
  data  = []
  label = []
  try:
    for _ in range(batch_size):
      l, d = next(stream_docs(docs))
      label.append(l)
      data.append(d)
  except StopIteration:
    return None,None

  return label,data

In [178]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

In [191]:
hv = HashingVectorizer( n_features=2**21,
                        preprocessor=None,
                        tokenizer=tokenizer,
                        decode_error = 'ignore')

In [180]:
clf = SGDClassifier(loss='log', random_state=0)

In [183]:
!pip install pyprind



In [185]:
import pyprind

In [None]:
pbar = pyprind.ProgBar(450)
for _ in range(450):
  X_train, y_train = mini_batch("/content/drive/MyDrive/movie_data.csv", 100)
  if not X_train:
    break
  X_train = hv.transform(X_train)
  clf.partial_fit(X_train, y_train)
  pbar.update

## Latent Drichlet Allocation

In [None]:
# df.rename(columns={"0":"label", "1":"sentiment"})

In [215]:
def clean_text(text):
  icons_pattern = "(?::|;|=)+(?:-|=)?(?:\(|\)|P|D)+"
  text = re.sub("<[^>]*>",'',text)
  emoticons = re.findall(icons_pattern,text)
  text = re.sub('[\W]+',' ',text)
  text = text.lower()
  text = text + ' '+ ''.join(emoticons)
  ret = [w for w in text.split() if w not in stop]
  return ' '.join(ret)

In [196]:
from sklearn.feature_extraction.text import CountVectorizer

In [218]:
df['sentiment'].apply(clean_text)

0        saturday matinee home town went older friend 1...
1        love movie first film master p ever done based...
2        voice begins film hughie billy connolly roadie...
3        spoiler alert point though think film ending s...
4        excellent film mel gibson braveheart trying ac...
                               ...                        
49995    although director tried filming made tynisia m...
49996    50 years since movie made romance mysticism tw...
49997    bar hopping seems trying stereotypical bar ten...
49998    awful effort goes show happens use computers g...
49999    yes among filmmakers came 80 90 gus van sant o...
Name: sentiment, Length: 50000, dtype: object

In [219]:
cv = CountVectorizer( stop_words="english", max_df=.1, max_features=5000 )
X = cv.fit_transform(df['sentiment'].values)

In [220]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10, learning_method='online', max_iter=20)

In [221]:
X_topics = lda.fit_transform(X)

In [206]:
lda.components_.shape

(10, 5000)

In [233]:
cv.get_feature_names_out()[4999]

'zone'

In [223]:
feature_name = cv.get_feature_names_out()
n_top_words = 5
for idx, topic in enumerate(lda.components_):
  print(feature_name[idx])

00
000
100
11
12
13
13th
14
15
16
