In [1]:
#pip install lxml
#pip install -U textblob 

# WEB SCRAPING and SENTIMENTAL ANALYSYS  

From a eu dairy milk forecast article, I will analyse if the article is sentimental positive or negative. F Beautiful Soup ill be used to extract the article form the web.

Analysis will be run paragraph by paragraph using multiple techniques. the mean of the paragraphs will be done to determine if the overall article is positive or negative.

# 0 Web Scraping 

In [2]:
import requests

from bs4                               import BeautifulSoup
from textblob                          import TextBlob
from nltk.corpus                       import stopwords
from nltk.stem                         import PorterStemmer
from textblob                          import Word
from sklearn.feature_extraction.text   import TfidfVectorizer
from sklearn.feature_extraction.text   import CountVectorizer


import pandas            as pd
import numpy             as np

import warnings
warnings.filterwarnings("ignore")

In [3]:
#import article using beautiful soup
url = 'http://foodingredientsfirst.com/news/eu-dairy-industry-forecasts-uncertain-2023-as-milk-availability-is-no-longer-abundant.html'
r = requests.get(url)
s = BeautifulSoup(r.content, 'html.parser')
a = []
for text in s.find_all('p'):
    b = text.get_text()
    a.append(b)
print(a)

['\r\n                                        By continuing to browse our site you agree to our Privacy & Cookie Policy.\r\n                                         > Privacy & Cookie Policy I Agree\n', 'FREE NEWSLETTER', '13 Dec 2022 --- European dairy trading body Eucolait underscores the moral necessity of the block sharing its milk supply, albeit scarce, via free market trade at a time when societies grapple with post-pandemic headwinds, rampant inflation, supply chain shocks and the ongoing war in Ukraine.', '13 Dec 2022 --- European dairy trading body Eucolait underscores the moral necessity of the block sharing its milk supply, albeit scarce, via free market trade at a time when societies grapple with post-pandemic headwinds, rampant inflation, supply chain shocks and the ongoing war in Ukraine.', 'Despite the EU milk industry facing declining volumes, trade experts call for the EU to increase the output of milk saying that sustainability targets should not come with reduced out

# 1 Sentimental Analysis

In [4]:
# Transform list into a dataframe
text = pd.DataFrame(a, columns=['paragraph'])
text

Unnamed: 0,paragraph
0,\r\n By...
1,FREE NEWSLETTER
2,13 Dec 2022 --- European dairy trading body Eu...
3,13 Dec 2022 --- European dairy trading body Eu...
4,Despite the EU milk industry facing declining ...
5,The EU exports 20% of its milk solids and desp...
6,Redistributing better milk exports might help ...
7,"Jukka Likitalo, secretary General of Eucolait,..."
8,He notes that while dairy prices have gone dow...
9,“Sentiment has weakened and prices have come d...


In [5]:
# drop duplicates as soom paragraphs were pushed twice like 2/3, 11/12, 33/34/36/40 ...
text = text.drop_duplicates()

## 1.1 Number of Words

In [6]:
# slit based on space
text['word_count'] = text['paragraph'].apply(lambda x: len(str(x).split(" ")))

# disclose df with text / number of words per paragraph
text[['paragraph','word_count']]

Unnamed: 0,paragraph,word_count
0,\r\n By...,101
1,FREE NEWSLETTER,2
2,13 Dec 2022 --- European dairy trading body Eu...,46
4,Despite the EU milk industry facing declining ...,43
5,The EU exports 20% of its milk solids and desp...,27
6,Redistributing better milk exports might help ...,10
7,"Jukka Likitalo, secretary General of Eucolait,...",18
8,He notes that while dairy prices have gone dow...,25
9,“Sentiment has weakened and prices have come d...,72
10,“There is no shortage of uncertainty factors h...,43


In [7]:
# drop paragraphs that do not add meaning
text.drop([1, 42, 43, 44,45], axis=0, inplace=True)
text.shape

(37, 2)

## 1.2 Number of characters

In [8]:
# count the number of characters
text['char_count'] = text['paragraph'].str.len()

# disclose df with text / number of letters per paragraph
text[['paragraph','char_count']].head()

Unnamed: 0,paragraph,char_count
0,\r\n By...,193
2,13 Dec 2022 --- European dairy trading body Eu...,293
4,Despite the EU milk industry facing declining ...,271
5,The EU exports 20% of its milk solids and desp...,153
6,Redistributing better milk exports might help ...,74


## 1.3 Average Word Length

In [9]:
# define average number of letters per words
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

In [10]:
text['avg_word'] = text['paragraph'].apply(lambda x: avg_word(x))

In [11]:
# disclose df with text / avrage number of letters per word per paragraph
text[['paragraph','avg_word']].head()

Unnamed: 0,paragraph,avg_word
0,\r\n By...,4.190476
2,13 Dec 2022 --- European dairy trading body Eu...,5.391304
4,Despite the EU milk industry facing declining ...,5.428571
5,The EU exports 20% of its milk solids and desp...,4.703704
6,Redistributing better milk exports might help ...,6.5


## 1.4 Number of stopwords

**One of the first NLP steps is to remove stopwords**

In [12]:
# select english dictionary
stop = stopwords.words('english')

In [13]:
# find number of stopwords
text['stopwords'] = text['paragraph'].apply(lambda x: len([x for x in x.split() if x in stop]))
text[['paragraph','stopwords']].head()

Unnamed: 0,paragraph,stopwords
0,\r\n By...,5
2,13 Dec 2022 --- European dairy trading body Eu...,11
4,Despite the EU milk industry facing declining ...,15
5,The EU exports 20% of its milk solids and desp...,9
6,Redistributing better milk exports might help ...,1


## 1.5 Basic Pre-Processing
**To obtain better features I will start by cleaning data using some basic pre-processing steps**

In [14]:
#avoids having multiple copies of the same word
text['paragraph'] = text['paragraph'].apply(lambda x: " ".join(x.lower() for x in x.split()))
text['paragraph'].head()

0    by continuing to browse our site you agree to ...
2    13 dec 2022 --- european dairy trading body eu...
4    despite the eu milk industry facing declining ...
5    the eu exports 20% of its milk solids and desp...
6    redistributing better milk exports might help ...
Name: paragraph, dtype: object

In [15]:
#remove punctuation, removing all instances of it will help us reduce the size of the training data.
text['paragraph'] = text['paragraph'].str.replace('[^\w\s]','')
text['paragraph'].head()

0    by continuing to browse our site you agree to ...
2    13 dec 2022  european dairy trading body eucol...
4    despite the eu milk industry facing declining ...
5    the eu exports 20 of its milk solids and despi...
6    redistributing better milk exports might help ...
Name: paragraph, dtype: object

In [16]:
#stop words (or commonly occurring words) should be removed from the text data
text['paragraph'] = text['paragraph'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
text['paragraph'].head()

0    continuing browse site agree privacy cookie po...
2    13 dec 2022 european dairy trading body eucola...
4    despite eu milk industry facing declining volu...
5    eu exports 20 milk solids despite limited supp...
6    redistributing better milk exports might help ...
Name: paragraph, dtype: object

In [17]:
# Commonly occured words were removed but it is as well possible to remove words that are often showing in our dataset just removed commonly occurring words in a general sense. We can also
# I will check the top 10 most used
freq = pd.Series(' '.join(text['paragraph']).split()).value_counts()[:10]
freq

dairy        24
food         14
prices       14
milk         13
trade        11
eu            9
demand        7
year          7
global        7
countries     6
dtype: int64

**I will not remove dairy as it is a very important word for the analysis**

In [18]:
#Now, let’s remove these words as their presence will not of any use in classification of our text data. 
freq = list(freq.index)
text['paragraph'] = text['paragraph'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
text['paragraph'].head()

0    continuing browse site agree privacy cookie po...
2    13 dec 2022 european trading body eucolait und...
4    despite industry facing declining volumes expe...
5    exports 20 solids despite limited supply bloc ...
6    redistributing better exports might help contr...
Name: paragraph, dtype: object

## 1.6 Rare words removal
**With most common words removed, I will remove the most rare as they can be considered noise**

In [19]:
freq = pd.Series(' '.join(text['paragraph']).split()).value_counts()[-10:]
freq

best           1
climates       1
places         1
impacts        1
adverse        1
fewer          1
change         1
climate        1
mitigating     1
netherlands    1
dtype: int64

In [20]:
freq = list(freq.index)
text['paragraph'] = text['paragraph'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
text['paragraph'].head()

0    continuing browse site agree privacy cookie po...
2    13 dec 2022 european trading body eucolait und...
4    despite industry facing declining volumes expe...
5    exports 20 solids despite limited supply bloc ...
6    redistributing better exports might help contr...
Name: paragraph, dtype: object

## 1.7 Spelling correction
**Spelling correction is usefull as well because it helps in reducing multiple copies of words. Considering that is a news story, this had been through this before publishing**

In [21]:
text['paragraph'].apply(lambda x: str(TextBlob(x).correct()))

0     continuing brows site agree privacy cook polic...
2     13 dec 2022 european trading body eucolait und...
4     despite industry facing declining volumes expe...
5     exports 20 solids despite limited supply bloc ...
6     redistributing better exports might help contr...
7     julia likitalo secretary general eucolait tell...
8     notes gone last months still uncertainty volat...
9     sentiment weakened come considerably last mont...
10    shortage uncertainty factors however energy in...
11    likitalo explains must sensible partners futur...
12    likitalo explains must sensible partners futur...
13    sustainable system uses scarce natural resourc...
14    enhanced sustainability european sector go han...
15    basic essence connect surplus deficit helps re...
16    internally likitalo highlight crucial security...
17    riseeucolait explains despite international ec...
18    notably weak chinese explains fall internation...
19    imports china 20 yearonyear according agri

## 1.8 Tokenization
**This steps refers to dividing the text into a sequence of words or sentences. First transform everything into a blob and then
converts into a series of words**

In [22]:
token =[]

for i in text.index:
    token.append(TextBlob(text['paragraph'][i]).words)
    
token

[WordList(['continuing', 'browse', 'site', 'agree', 'privacy', 'cookie', 'policy', 'privacy', 'cookie', 'policy', 'agree']),
 WordList(['13', 'dec', '2022', 'european', 'trading', 'body', 'eucolait', 'underscores', 'moral', 'necessity', 'block', 'sharing', 'supply', 'albeit', 'scarce', 'via', 'free', 'market', 'time', 'societies', 'grapple', 'postpandemic', 'headwinds', 'rampant', 'inflation', 'supply', 'chain', 'shocks', 'ongoing', 'war', 'ukraine']),
 WordList(['despite', 'industry', 'facing', 'declining', 'volumes', 'experts', 'call', 'increase', 'output', 'saying', 'sustainability', 'targets', 'come', 'reduced', 'outputs', 'flagging', 'production', 'might', 'important', 'issue', 'restrictions']),
 WordList(['exports', '20', 'solids', 'despite', 'limited', 'supply', 'bloc', 'duty', 'contribute', 'security', 'according', 'eucolait']),
 WordList(['redistributing', 'better', 'exports', 'might', 'help', 'contribute', 'security']),
 WordList(['jukka', 'likitalo', 'secretary', 'general', 

## 1.9 Stemming
**Tihs steps refers to the removal of suffices, like “ing”, “ly”, “s”...**

In [23]:
st = PorterStemmer()
text['paragraph'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0     continu brows site agre privaci cooki polici p...
2     13 dec 2022 european trade bodi eucolait under...
4     despit industri face declin volum expert call ...
5     export 20 solid despit limit suppli bloc duti ...
6     redistribut better export might help contribut...
7     jukka likitalo secretari gener eucolait tell f...
8     note gone last month still uncertainti volatil...
9     sentiment weaken come consider last month prod...
10    shortag uncertainti factor howev energi input ...
11    likitalo explain must sensibl partner futur av...
12    likitalo explain must sensibl partner futur av...
13    sustain system use scarc natur resourc effici ...
14    enhanc sustain european sector go handinhand d...
15    basic essenc connect surplus deficit help regi...
16    intern likitalo highlight crucial secur standp...
17    riseeucolait explain despit intern econom woe ...
18    notabl weak chines explain fall intern neverth...
19    import china 20 yearonyear accord agricult

## 1.10 Lemmatization
**Lemmatization is a more effective option than stemming because it converts the word into its
root word, rather than just stripping the suffices**

In [24]:
text['paragraph'] = text['paragraph'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
text['paragraph'].head()

0    continuing browse site agree privacy cookie po...
2    13 dec 2022 european trading body eucolait und...
4    despite industry facing declining volume exper...
5    export 20 solid despite limited supply bloc du...
6    redistributing better export might help contri...
Name: paragraph, dtype: object

## 1.11 Advance Text Processing
**N-grams are the combination of multiple words used together. Ngrams with N=1 are called
unigrams.**

In [25]:
adv =[]

for i in text.index:
    adv.append(TextBlob(text['paragraph'][i]).ngrams(3))
    
adv


[[WordList(['continuing', 'browse', 'site']),
  WordList(['browse', 'site', 'agree']),
  WordList(['site', 'agree', 'privacy']),
  WordList(['agree', 'privacy', 'cookie']),
  WordList(['privacy', 'cookie', 'policy']),
  WordList(['cookie', 'policy', 'privacy']),
  WordList(['policy', 'privacy', 'cookie']),
  WordList(['privacy', 'cookie', 'policy']),
  WordList(['cookie', 'policy', 'agree'])],
 [WordList(['13', 'dec', '2022']),
  WordList(['dec', '2022', 'european']),
  WordList(['2022', 'european', 'trading']),
  WordList(['european', 'trading', 'body']),
  WordList(['trading', 'body', 'eucolait']),
  WordList(['body', 'eucolait', 'underscore']),
  WordList(['eucolait', 'underscore', 'moral']),
  WordList(['underscore', 'moral', 'necessity']),
  WordList(['moral', 'necessity', 'block']),
  WordList(['necessity', 'block', 'sharing']),
  WordList(['block', 'sharing', 'supply']),
  WordList(['sharing', 'supply', 'albeit']),
  WordList(['supply', 'albeit', 'scarce']),
  WordList(['albeit'

## 1.12 Term frequency
**This step does the ratio of the count of a word present in a sentence, to the length of the sentence.**

TF = (Number of times term T appears in the particular row) / (number of terms in that row)

In [26]:
tf1 = (text['paragraph'][0:37]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,agree,2.0
1,privacy,2.0
2,cookie,2.0
3,policy,2.0
4,continuing,1.0
...,...,...
416,hosted,1.0
417,cns,1.0
418,medium,1.0
419,bv,1.0


## 1.13 Inverse Document Frequency
**Inverse document frequency (IDF) is tries to prove that a word is not of much use if it is appearing in all the documents**

IDF = log(N/n), where, N is the total number of rows and n is the number of rows in which the word was present. 

In [27]:
#The more the value of IDF, the more unique is the word.
for i,word in enumerate(tf1['words']):
    tf1.loc[i, 'idf'] = np.log(text.shape[0]/(len(text[text['paragraph'].str.contains(word)])))

tf1.head()

Unnamed: 0,words,tf,idf
0,agree,2.0,3.610918
1,privacy,2.0,3.610918
2,cookie,2.0,3.610918
3,policy,2.0,3.610918
4,continuing,1.0,3.610918


### 1.14 Term Frequency – Inverse Document Frequency (TF-IDF)


In [28]:
#TF-IDF is the multiplication of the TF and IDF which we calculated above
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,agree,2.0,3.610918,7.221836
1,privacy,2.0,3.610918,7.221836
2,cookie,2.0,3.610918,7.221836
3,policy,2.0,3.610918,7.221836
4,continuing,1.0,3.610918,3.610918
...,...,...,...,...
416,hosted,1.0,3.610918,3.610918
417,cns,1.0,2.917771,2.917771
418,medium,1.0,3.610918,3.610918
419,bv,1.0,3.610918,3.610918


In [29]:
# There is no need to calculate TF and IDF every time beforehand and then multiply it to obtain TF-IDF. 
# Instead, sklearn has a separate function to directly obtain it: 

tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
stop_words= 'english',ngram_range=(1,1))
text_vect = tfidf.fit_transform(text['paragraph'])

text_vect

<37x387 sparse matrix of type '<class 'numpy.float64'>'
	with 565 stored elements in Compressed Sparse Row format>

In [30]:
print(text_vect)

  (0, 264)	0.4588314677411236
  (0, 80)	0.4588314677411236
  (0, 274)	0.4588314677411236
  (0, 19)	0.4588314677411236
  (0, 332)	0.2294157338705618
  (0, 49)	0.2294157338705618
  (0, 77)	0.2294157338705618
  (1, 363)	0.18989805657191236
  (1, 377)	0.18989805657191236
  (1, 247)	0.18989805657191236
  (1, 326)	0.18989805657191236
  (1, 56)	0.18989805657191236
  (1, 193)	0.14578485564351867
  (1, 286)	0.18989805657191236
  (1, 174)	0.18989805657191236
  (1, 267)	0.18989805657191236
  (1, 166)	0.18989805657191236
  (1, 334)	0.18989805657191236
  (1, 359)	0.18989805657191236
  (1, 224)	0.12315736001057263
  (1, 156)	0.18989805657191236
  (1, 316)	0.17037765419171577
  (1, 24)	0.18989805657191236
  (1, 347)	0.29156971128703735
  (1, 325)	0.18989805657191236
  :	:
  (35, 355)	0.4812126319882611
  (35, 374)	0.24060631599413054
  (35, 290)	0.24060631599413054
  (35, 336)	0.24060631599413054
  (35, 353)	0.24060631599413054
  (35, 32)	0.24060631599413054
  (35, 38)	0.24060631599413054
  (35, 339)

## 1.15 Bag of Words
**This is the representation of text which describes the presence of words within the text data. The idea behThis helps in understanding the meaning of the document.**

In [31]:
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
text_bow = bow.fit_transform(text['paragraph'])
text_bow

<37x420 sparse matrix of type '<class 'numpy.int64'>'
	with 612 stored elements in Compressed Sparse Row format>

In [32]:
print(text_bow)

  (0, 82)	1
  (0, 52)	1
  (0, 357)	1
  (0, 20)	2
  (0, 297)	2
  (0, 85)	2
  (0, 287)	2
  (1, 2)	1
  (1, 99)	1
  (1, 4)	1
  (1, 136)	1
  (1, 389)	1
  (1, 48)	1
  (1, 134)	1
  (1, 398)	1
  (1, 246)	1
  (1, 254)	1
  (1, 46)	1
  (1, 348)	1
  (1, 373)	2
  (1, 25)	1
  (1, 339)	1
  (1, 406)	1
  (1, 165)	1
  (1, 238)	1
  :	:
  (35, 364)	1
  (35, 41)	1
  (35, 34)	1
  (35, 380)	1
  (35, 361)	1
  (35, 313)	1
  (35, 405)	1
  (35, 382)	2
  (35, 333)	1
  (35, 291)	1
  (35, 326)	1
  (35, 24)	1
  (36, 109)	1
  (36, 210)	1
  (36, 301)	1
  (36, 207)	1
  (36, 161)	1
  (36, 221)	1
  (36, 305)	1
  (36, 284)	1
  (36, 191)	1
  (36, 68)	1
  (36, 240)	1
  (36, 56)	1
  (36, 31)	1


# 2 Final Sentimental Analysis
**Concluding the problem of detecting the sentiment of the article and before applying any ML/DL models let’s check the sentiment of the first few paragraphs.**

In [33]:
text['paragraph'].apply(lambda x: TextBlob(x).sentiment)

0                                       (0.0, 0.0)
2       (0.13333333333333333, 0.35000000000000003)
4                                       (0.4, 1.0)
5      (-0.03571428571428571, 0.12142857142857143)
6                                       (0.5, 0.5)
7                       (0.05000000000000002, 0.5)
8                       (0.0, 0.06666666666666667)
9       (0.08522727272727272, 0.31603535353535356)
10       (0.16666666666666666, 0.3333333333333333)
11       (0.06666666666666667, 0.1416666666666667)
12       (0.14285714285714285, 0.2357142857142857)
13       (0.03333333333333333, 0.6333333333333333)
14                                      (0.0, 0.0)
15                                    (0.0, 0.125)
16     (-0.023809523809523808, 0.4047619047619047)
17       (0.06999999999999999, 0.3866666666666666)
18                   (-0.075, 0.15833333333333333)
19     (-0.07428571428571429, 0.32285714285714284)
20                                    (0.125, 0.5)
21                             

**Above, we should only extract polarity as it indicates the sentiment as value nearer to 1 means a
positive sentiment and values nearer to -1 means a negative sentiment. This can also work as
a feature for building a machine learning model.**

In [40]:
text['sentiment'] = text['paragraph'].apply(lambda x: TextBlob(x).sentiment[0] )
text[['paragraph','sentiment']].tail()

Unnamed: 0,paragraph,sentiment
35,inflation one significant topic 2023 get under...,0.4375
37,partnership agriculture organization united na...,0.0
39,china relax testing imported frozen chilled pr...,0.4
41,2023 begin continue attempt tame inflation spa...,-0.108333
46,foodingredientsfirstcom leading international ...,0.0


## 2.1 Final Score

In [41]:
#We calcule the mean to check if our article is positive of negative, in this case it is positive
text['sentiment'].mean()

0.08674037674037674

### Conclusion: as the result is between -1 and 1 and it is greather than 0, we conclude the article is POSITIVE