In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Installing Spacy library

!pip install spacy==3.1.1
!pip install spacy-transformers

Collecting spacy==3.1.1
  Downloading spacy-3.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 1.5 MB/s 
Collecting thinc<8.1.0,>=8.0.8
  Downloading thinc-8.0.13-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (628 kB)
[K     |████████████████████████████████| 628 kB 48.4 MB/s 
[?25hCollecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (451 kB)
[K     |████████████████████████████████| 451 kB 54.2 MB/s 
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 49.0 MB/s 
[?25hCollecting typer<0.4.0,>=0.3.0
  Downloading typer-0.3.2-py3-none-any.whl (21 kB)
Collecting catalogue<2.1.0,>=2.0.4
  Downloading catalogue-2.0.6-py3-none-any.whl (17 kB)
Collecting pathy>=0.3.5
  Downloading pathy-0.6.1-py3-none-any.whl (42 kB)
[K     

In [3]:
# Downloading the spaCy Transformer model "en_core_web_trf"
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.2.0/en_core_web_trf-3.2.0-py3-none-any.whl (460.2 MB)
[K     |████████████████████████████████| 460.2 MB 34 kB/s 
Installing collected packages: en-core-web-trf
Successfully installed en-core-web-trf-3.2.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


In [4]:
# Importing libraries

import pandas as pd
from datetime import datetime
import spacy
import spacy_transformers

# Storing docs in binary format
from spacy.tokens import DocBin

In [5]:
DATASET_PATH = '/content/drive/My Drive/BERT_SPACY/'

In [7]:
# Reading the dataset
COLNAMES = ['Sentiment','tweet_id', 'Text']
df = pd.read_csv((DATASET_PATH+"retweet_dataset.tsv"), sep='\t', names=COLNAMES)
df.head()

Unnamed: 0,Sentiment,tweet_id,Text
0,negative,1222226229446750214,Deputy White House counsel Pat Philbin on the ...
1,negative,1236289649737371648,Panic buying and stockpiling of toilet roll co...
2,positive,1237489705090002944,Not Available
3,negative,1222258314857172993,"It’s Super Bowl week, and Trump is using a pre..."
4,negative,1236432648374857728,Turns out Donald Trump was potentially exposed...


In [9]:
df.drop(labels=['tweet_id'], axis=1, inplace=True)
df.head()

Unnamed: 0,Sentiment,Text
0,negative,Deputy White House counsel Pat Philbin on the ...
1,negative,Panic buying and stockpiling of toilet roll co...
2,positive,Not Available
3,negative,"It’s Super Bowl week, and Trump is using a pre..."
4,negative,Turns out Donald Trump was potentially exposed...


In [10]:
df = df[(df.Text != "Not Available")]

df.head()

Unnamed: 0,Sentiment,Text
0,negative,Deputy White House counsel Pat Philbin on the ...
1,negative,Panic buying and stockpiling of toilet roll co...
3,negative,"It’s Super Bowl week, and Trump is using a pre..."
4,negative,Turns out Donald Trump was potentially exposed...
5,positive,Had a totally surreal author moment at B&amp;N...


In [11]:
df.Sentiment.value_counts()

negative    467
positive    387
neutral     360
Name: Sentiment, dtype: int64

In [12]:
df.shape

(1214, 2)

In [13]:
#Splitting the dataset into train and test
train = df.sample(frac = 0.8, random_state = 25)
test = df.drop(train.index)

In [14]:
train.shape, test.shape

((971, 2), (243, 2))

In [15]:
import spacy
nlp=spacy.load("en_core_web_trf")

In [16]:
#Creating tuples
train['tuples'] = train.apply(lambda row: 
                              (row['Text'],row['Sentiment']), axis=1)
train = train['tuples'].tolist()

test['tuples'] = test.apply(lambda row:
                            (row['Text'],row['Sentiment']), axis=1)
test = test['tuples'].tolist()

train[0]

('South Africa is already facing an economic crisis, loadshedding, an already overburdened health system, not enough… https://t.co/mDeGndHTQq',
 'negative')

In [17]:
# User function for converting the train and test dataset into spaCy document
def document(data):
#Creating empty list called "text"  
  text = []
  for doc, label in nlp.pipe(data, as_tuples = True):
    if (label=='positive'):
      doc.cats['positive'] = 1
      doc.cats['negative'] = 0
      doc.cats['neutral'] = 0
    elif (label=='neutral'):
      doc.cats['positive'] = 0
      doc.cats['neutral'] = 1
      doc.cats['negative'] = 0
    else:
      doc.cats['positive'] = 0
      doc.cats['neutral'] = 0
      doc.cats['negative'] = 1
#Adding the doc into the list 'text'
      text.append(doc)
  return(text)

In [19]:
# Calculate the time for converting into binary document for train dataset

start_time = datetime.now()

#passing the train dataset into function 'document'
train_docs = document(train)

#Creating binary document using DocBin function in spaCy
doc_bin = DocBin(docs = train_docs)

#Saving the binary document as train.spacy
doc_bin.to_disk("/content/drive/My Drive/BERT_SPACY/train.spacy")
end_time = datetime.now()

#Printing the time duration for train dataset
print('Duration: {}'.format(end_time - start_time))

Duration: 0:03:25.372738


In [None]:
!pip freeze requirements.txt

In [20]:
# Calculate the time for converting into binary document for test dataset

start_time = datetime.now()

#passing the test dataset into function 'document'
test_docs = document(test)
doc_bin = DocBin(docs = test_docs)
doc_bin.to_disk("/content/drive/My Drive/BERT_SPACY/valid.spacy")
end_time = datetime.now()

#Printing the time duration for test dataset
print('Duration: {}'.format(end_time - start_time))

Duration: 0:00:41.366028


In [21]:
#Converting base configuration into full config file

!python -m spacy init fill-config /content/drive/My\ Drive/BERT_SPACY/base_config.cfg ./config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [22]:
#Calculating the time for training the model
start_time = datetime.now()

# To train the model. Enabled GPU and storing the model output in folder called output_updated
!python -m spacy train config.cfg --verbose  --gpu-id 0 --output /content/drive/My\ Drive/BERT_SPACY/output_updated/

end_time = datetime.now()

#Printing the time taken for training the model
print('Duration: {}'.format(end_time - start_time))

[38;5;4mℹ Saving to output directory: /content/drive/My
Drive/BERT_SPACY/output_updated[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-01-16 10:10:41,952] [INFO] Set up nlp object from config
[2022-01-16 10:10:41,964] [DEBUG] Loading corpus from path: /content/drive/My Drive/BERT_SPACY/valid.spacy
[2022-01-16 10:10:41,965] [DEBUG] Loading corpus from path: /content/drive/My Drive/BERT_SPACY/train.spacy
[2022-01-16 10:10:41,965] [INFO] Pipeline: ['transformer', 'textcat']
[2022-01-16 10:10:41,970] [INFO] Created vocabulary
[2022-01-16 10:10:41,972] [INFO] Finished initializing nlp object
Downloading: 100% 481/481 [00:00<00:00, 604kB/s]
Downloading: 100% 878k/878k [00:00<00:00, 5.00MB/s]
Downloading: 100% 446k/446k [00:00<00:00, 3.01MB/s]
Downloading: 100% 1.29M/1.29M [00:00<00:00, 7.60MB/s]
Downloading: 100% 478M/478M [00:12<00:00, 40.5MB/s]
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_h

In [None]:
while True: pass

KeyboardInterrupt: ignored

### Testing

In [24]:
text = "Australia's largest airline temporarily lays off 2,500 employees"# Loading the best model from output_updated folder

nlp = spacy.load("/content/drive/My Drive/BERT_SPACY/output_updated/model-best")
demo = nlp(text)
print(demo.cats)

{'positive': 3.0671328659082064e-06, 'neutral': 3.0671328659082064e-06, 'negative': 0.9999939203262329}


In [25]:
text = "I love you babe"# Loading the best model from output_updated folder

nlp = spacy.load("/content/drive/My Drive/BERT_SPACY/output_updated/model-best")
demo = nlp(text)
print(demo.cats)

{'positive': 3.4221570786030497e-06, 'neutral': 3.4221570786030497e-06, 'negative': 0.9999932050704956}


In [26]:
text = "I hate your face"# Loading the best model from output_updated folder

nlp = spacy.load("/content/drive/My Drive/BERT_SPACY/output_updated/model-best")
demo = nlp(text)
print(demo.cats)

{'positive': 3.4222975955344737e-06, 'neutral': 3.4222975955344737e-06, 'negative': 0.9999932050704956}


In [27]:
text = "you have nice hair but i do not like this hair style"# Loading the best model from output_updated folder

nlp = spacy.load("/content/drive/My Drive/BERT_SPACY/output_updated/model-best")
demo = nlp(text)
print(demo.cats)

{'positive': 3.009255578945158e-06, 'neutral': 3.009255578945158e-06, 'negative': 0.9999940395355225}
