<a href="https://colab.research.google.com/github/ASIF-Mahmud1/Exploration/blob/text-classifier/NLP_Lessons/BERT/EnronDatasetDiminsihingTerms_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Install torchvision
!pip install torchvision




In [2]:
import torch

In [3]:
from typing import Callable, List, Optional, Tuple

import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
import torch


class BertTransformer(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            bert_tokenizer,
            bert_model,
            max_length: int = 60,
            embedding_func: Optional[Callable[[torch.tensor], torch.tensor]] = None,
    ):
        self.tokenizer = bert_tokenizer
        self.model = bert_model
        self.model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func

        if self.embedding_func is None:
            self.embedding_func = lambda x: x[0][:, 0, :].squeeze()

    def _tokenize(self, text: str) -> Tuple[torch.tensor, torch.tensor]:
        # Tokenize the text with the provided tokenizer
        tokenized_text = self.tokenizer.encode_plus(text,
                                                    add_special_tokens=True,
                                                    max_length=self.max_length
                                                    )["input_ids"]

        # Create an attention mask telling BERT to use all words
        attention_mask = [1] * len(tokenized_text)

        # bert takes in a batch so we need to unsqueeze the rows
        return (
            torch.tensor(tokenized_text).unsqueeze(0),
            torch.tensor(attention_mask).unsqueeze(0),
        )

    def _tokenize_and_predict(self, text: str) -> torch.tensor:
        tokenized, attention_mask = self._tokenize(text)

        embeddings = self.model(tokenized, attention_mask)
        return self.embedding_func(embeddings)

    def transform(self, text: List[str]):
        if isinstance(text, pd.Series):
            text = text.tolist()

        with torch.no_grad():
            return torch.stack([self._tokenize_and_predict(string) for string in text])

    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self

In [4]:
!pip install transformers


Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.2MB 5.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 19.7MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 20.0MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=79967

In [5]:
from transformers import BertTokenizer, BertModel
import torch
bert_model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [6]:
bert_transformer = BertTransformer(tokenizer, bert_model)
from sklearn.pipeline import Pipeline


In [7]:
url="https://raw.githubusercontent.com/ASIF-Mahmud1/Exploration/text-classifier/DiminishingTerms/dataSet.csv"
from io import StringIO
import string
import pandas as pd
import requests
s=requests.get(url).text

message_data=pd.read_csv(StringIO(s))
message_data['tag'] = message_data['tag'].str.strip()

message_data.head()
message_data['tag'].describe()

count             315
unique              2
top       diminishing
freq              234
Name: tag, dtype: object

In [8]:
message_data_copy = message_data['sentence'].copy() 

# message_data_copy

In [9]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(multi_class='multinomial', solver='lbfgs',C=5.2)


In [10]:
pipeline = Pipeline(steps=  [
        ("vectorizer", bert_transformer),
        ("classifier", classifier),
    ] )

pipeline.fit(message_data_copy,  message_data['tag'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('vectorizer',
                 BertTransformer(bert_model=None, bert_tokenizer=None,
                                 embedding_func=<function BertTransformer.__init__.<locals>.<lambda> at 0x7eff1f8010e0>,
                                 max_length=60)),
                ('classifier',
                 LogisticRegression(C=5.2, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='multinomial', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [11]:
import pandas as pd
df = pd.read_csv('https://query.data.world/s/yqgskd2yg2lztwtgmpeajy75msskd6',error_bad_lines=False, index_col=False, dtype='unicode')
df = df.applymap(str) ## stringify all fields


In [12]:
sender= [
  "susan.bailey@enron.com",
	"sally.beck@enron.com",
  "lynn.blair@enron.com",
  "sandra.brawner@enron.com",
  "michelle.cash@enron.com",
  "monika.causholli@enron.com",
  "stacy.dickson@enron.com",
]

senderFinal=[]
for email in sender:
  email="frozenset({'" + email+"'})"
  senderFinal.append(email)


# femaleEmail= df.loc[df['From'].isin(senderFinal)]
# femaleEmail.head()

df= df.loc[df['From'].isin(senderFinal)]

print("Size of Dataset: ",df.shape[0])


Size of Dataset:  9167


In [13]:
collectionSize=10
startIndex=0
endIndex=startIndex + collectionSize

# Test Sentences Here
### **Note**: Click Play to Predict next 10 emails

In [14]:
####################################################################################################################
                       ## Remove this later
import pandas as pd
import altair as alt
import numpy as np
text= df['content']  # all data
# text= femaleEmail['content']    # all female data
text= text[startIndex:endIndex]
prob_Of_Each_Class = pipeline.predict_proba(text)
X = np.arange(startIndex,endIndex)
y_strong= prob_Of_Each_Class[:,1]*100
y_diminishing=prob_Of_Each_Class[:,0]*100    

dfTest = pd.DataFrame(columns=('index', 'strong', 'diminish','sentence'))
dfTest['index']=X 
dfTest['strong']=y_strong 
dfTest['diminish']=y_diminishing
dfTest['sentence']=list(text)
gp_chart = (
  alt.Chart(dfTest,width=50 ,padding={"left": 50, "top": 5, "right": 5, "bottom": 5} )
    .transform_fold(["diminish", "strong"], as_=["key", "value"])
    .mark_bar()  # size=50
    .encode(
        x="key:N",
        y="value:Q",
        color="key:N",
        column="index",
        tooltip="sentence"
    )   # .interactive() 
)

startIndex= startIndex+collectionSize
endIndex=startIndex+collectionSize
gp_chart.display()

    
for index, row in enumerate(text.index):
    print(X[index],  " "+text[row], " ",X[index])    # " ",row,
    
                            ## Remove this later
####################################################################################################################

0  A backup seat and a backup computer have been assigned to you. Please read this communication in detail. Purpose: Net Works has developed a backup plan for possible work stoppages after moving to the new building. A wider business continuity plan will be rolled out next year. Location: Large areas of the 30th and 31st floors of the current building, Enron Center North, will be set aside for recovery purposes. Your name has been put on the list and you will be notified once the seat assignments are finalized. Locations will also be posted at the entryways on the 30th and 31st floors. Timing. Through November, backup seats assignments will be announced as each trading group moves to the new building. The backup seats will be available at least to January 1st. Testing. Test times to try the backup PC and to familiarize yourself with your backup location will be announced by the IT team as the locations are finished. Telephones. Only regular phones will be available. For those with spee

## Write to a CSV File

In [15]:
import pandas as pd
    
save = pd.DataFrame(dfTest, columns=  dfTest.columns)

save.to_csv (r'export_dataframe.csv', index = False, header=True)

print (save)


   index  ...                                           sentence
0      0  ...  A backup seat and a backup computer have been ...
1      1  ...  A backup seat and a backup computer have been ...
2      2  ...  Kelley, Looks fine to me. Susan Bailey -----Or...
3      3  ...  Ladies, I will be out for a day of vacation --...
4      4  ...  Russell, Best wishes & much success on your ne...
5      5  ...  Kay, The "Financial Trading Database" under Lo...
6      6  ...  Sara, Please advise if I may be of assistance ...
7      7  ...  Bob, As mentioned in my voice mail -- I can me...
8      8  ...  Bob, I will see you in your office @ 3:30pm. S...
9      9  ...  Diane or Patrick, Please email or provide hard...

[10 rows x 4 columns]


## Save whole email corpus in a CSV file
 

In [16]:
import pandas as pd
import altair as alt
import numpy as np
text= df['content']  # all data
#text= text[0:10]     # first 10 dataset
prob_Of_Each_Class = pipeline.predict_proba(text)
tags= pipeline.predict(text)


In [22]:
startIndex=0
endIndex= df.shape[0]

X = np.arange(startIndex,endIndex)
y_strong= prob_Of_Each_Class[:,1]*100
y_diminishing=prob_Of_Each_Class[:,0]*100    
y_strong 
dfTest = pd.DataFrame(columns=('index', 'strong', 'diminish','sentence'))
dfTest['index']=X 
dfTest['strong']=y_strong 
dfTest['diminish']=y_diminishing
dfTest['sentence']=list(text)
dfTest.insert(1, 'tag', list(tags) )
dfTest['source']= "Enron email"
dfTest

Unnamed: 0,index,sentence,source
0,0,A backup seat and a backup computer have been ...,Enron email
1,1,A backup seat and a backup computer have been ...,Enron email
2,2,"Kelley, Looks fine to me. Susan Bailey -----Or...",Enron email
3,3,"Ladies, I will be out for a day of vacation --...",Enron email
4,4,"Russell, Best wishes & much success on your ne...",Enron email
...,...,...,...
9162,9162,"Bianca, Attached is the template for Annexes B...",Enron email
9163,9163,"Cara, Tag 22874 got curtailed Day Ahead from 5...",Enron email
9164,9164,"Hi Bill, Sorry I haven't been alive earlier bu...",Enron email
9165,9165,I realize that the timing of the Enron Net Wor...,Enron email


In [24]:
import pandas as pd
    
save = pd.DataFrame(dfTest, columns=  dfTest.columns)

save.to_csv (r'enronEmailWithoutTag.csv', index = False, header=True)

print (save)


      index                                           sentence       source
0         0  A backup seat and a backup computer have been ...  Enron email
1         1  A backup seat and a backup computer have been ...  Enron email
2         2  Kelley, Looks fine to me. Susan Bailey -----Or...  Enron email
3         3  Ladies, I will be out for a day of vacation --...  Enron email
4         4  Russell, Best wishes & much success on your ne...  Enron email
...     ...                                                ...          ...
9162   9162  Bianca, Attached is the template for Annexes B...  Enron email
9163   9163  Cara, Tag 22874 got curtailed Day Ahead from 5...  Enron email
9164   9164  Hi Bill, Sorry I haven't been alive earlier bu...  Enron email
9165   9165  I realize that the timing of the Enron Net Wor...  Enron email
9166   9166  I left you a couple of messages with two diffe...  Enron email

[9167 rows x 3 columns]


## Add New Columns

In [None]:
cars = {'Brand': ['Honda Civic','Toyota Corolla','Ford Focus','Audi A4'],
        'Price': [22000,25000,27000,35000]
        }


cars= pd.DataFrame(cars, columns=  ['Brand','Price'])
cars['Address'] = "Lol"   
cars

# cars[cars.columns[:3]]

## Paragraph to Sentence

In [None]:
import nltk
nltk.download('punkt')
para= save['sentence'][1]
tokens=[]
print(para)
tokens = nltk.sent_tokenize(para)
print(tokens)

for index, row in enumerate(save.sentence):
    print(index,row) 