In [1]:
import json
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [2]:
# Load the json file
with open('tweets.json') as jfile:
    tweets = json.load(jfile)


In [3]:
# Convert json to dataframe
df = pd.DataFrame.from_dict(tweets, orient='index')

In [4]:
df.head(5)

Unnamed: 0,tweet_author,tweet_text
1374140386071961602,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...
1374032432173842437,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...
1373902876553048065,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...
1373656782367813635,Toby Eyre,#acalabrutinib is a valuable option in pts int...
1372941634334232586,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...


In [5]:
# Define stop words
stop_words = stopwords.words("english")

In [6]:
stop_words.append("http")

In [7]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [8]:
# Define lemmatizer
lemmatizer = WordNetLemmatizer()

In [9]:
# Define preprocessing function
def preprocess(tweet):
    # Tokenize the tweet
    tokens = word_tokenize(tweet)
    # Remove stopwords and lemmatize the tokens
    filtered_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in stop_words]
    filtered_text = " ".join(filtered_tokens)
    final = [i for i in filtered_text if i not in string.punctuation]
    final_text = "".join(final)
    return final_text

In [10]:
df["preprocessed_text"] = df["tweet_text"].apply(preprocess)

In [11]:
df.head(5)

Unnamed: 0,tweet_author,tweet_text,preprocessed_text
1374140386071961602,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,⚕️ scientist conducted phase ii study acalabru...
1374032432173842437,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...,phase 2 acalabrutinibvenetoclax av trial sti...
1373902876553048065,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...,nice back astrazenecas calquence cll http ...
1373656782367813635,Toby Eyre,#acalabrutinib is a valuable option in pts int...,acalabrutinib valuable option pt intolerant ...
1372941634334232586,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...,nice recommended use acalabrutinib patient tre...


In [12]:
l =[]
for i in range(len(df["preprocessed_text"])):
    tokens = word_tokenize(df["preprocessed_text"].iloc[i])
    l.extend(tokens)

In [13]:
l

['⚕️',
 'scientist',
 'conducted',
 'phase',
 'ii',
 'study',
 'acalabrutinib',
 'patient',
 'relapsedrefractory',
 'cll',
 'ibrutinibintolerant',
 'found',
 'overall',
 'response',
 'rate',
 '73',
 'http',
 'tcoej6m4qpc5p',
 'http',
 'tcokuzz6zo47r',
 'phase',
 '2',
 'acalabrutinibvenetoclax',
 'av',
 'trial',
 'still',
 'recruitment',
 'phase',
 'study',
 'well',
 'venetoclax',
 'acalabrutinib',
 'work',
 'mcl',
 'patient',
 'either',
 'relapsed',
 'nonrespondent',
 'initial',
 'therapy',
 'http',
 'tcogg0g9at23n',
 'nice',
 'back',
 'astrazenecas',
 'calquence',
 'cll',
 'http',
 'tcovb5lpdogra',
 'acalabrutinib',
 'valuable',
 'option',
 'pt',
 'intolerant',
 'ibrutinib',
 'valuable',
 'data',
 'help',
 'decision',
 'making',
 'cll',
 'early',
 'view',
 'haematologica',
 'http',
 'tcoz2kclzax0d',
 'nice',
 'recommended',
 'use',
 'acalabrutinib',
 'patient',
 'treatmentnaïve',
 'chronic',
 'lymphocytic',
 'leukemia',
 'find',
 'http',
 'tco6oujptlcin',
 'lymsm',
 'lymphoma',
 'nice

In [14]:
entities_freq = pd.Series(l).value_counts()

In [15]:
entities_freq

http             46995
cll              28689
leukemia         19786
lymphocytic      17378
chronic          17101
                 ...  
tcofawzr5yh0o        1
tcollbkijo8un        1
gallon               1
tcorlwuh8wznu        1
tcoihcjylt0xv        1
Length: 85374, dtype: int64

In [16]:
a = pd.Series(entities_freq)
b = a.reset_index()
b = b.rename({"index":"entities",0:"frequency"},axis=1)
b.sort_values("frequency",ascending=False,inplace=True)
b.reset_index(drop=True,inplace=True)

In [17]:
b

Unnamed: 0,entities,frequency
0,http,46995
1,cll,28689
2,leukemia,19786
3,lymphocytic,17378
4,chronic,17101
...,...,...
85369,tcoqjnqs3nhhp,1
85370,tcod36vvtvs3y,1
85371,tcolpl9ln9iqn,1
85372,tcotjsye55rt1,1


In [18]:
import numpy as np

In [19]:
b1 = b[b["frequency"]>100]

In [20]:
b1

Unnamed: 0,entities,frequency
0,http,46995
1,cll,28689
2,leukemia,19786
3,lymphocytic,17378
4,chronic,17101
...,...,...
823,specialist,102
824,prescription,101
825,los,101
826,caregiver,101


In [21]:
b1.to_csv("objective1.csv")

In [22]:
max_freq_words = b1["entities"].values

In [23]:
max_freq = max_freq_words.tolist()

In [24]:
max_freq

['http',
 'cll',
 'leukemia',
 'lymphocytic',
 'chronic',
 'patient',
 'treatment',
 'new',
 'cancer',
 'lymphoma',
 'leusm',
 'amp',
 'ibrutinib',
 'acalabrutinib',
 'cell',
 'dr',
 'therapy',
 'trial',
 'calquence',
 'venetoclax',
 'study',
 's',
 'drug',
 'hematology',
 '’',
 'fda',
 'clinical',
 'relapsed',
 'article',
 'phase',
 'via',
 'data',
 'lymsm',
 'astrazeneca',
 'w',
 'expert',
 'research',
 'inhibitor',
 'covid19',
 'learn',
 'video',
 'combination',
 'leukaemia',
 'adult',
 'cllsm',
 'result',
 'blood',
 'rituximab',
 'discus',
 'disease',
 'oncology',
 'u',
 'btk',
 'today',
 'option',
 'de',
 'response',
 'show',
 'review',
 'mantle',
 'news',
 'md',
 'bcell',
 'obinutuzumab',
 'get',
 'plus',
 'may',
 '3',
 'refractory',
 'relapsedrefractory',
 'survival',
 'watch',
 'approves',
 'risk',
 'year',
 'targeted',
 'pt',
 'approved',
 'mutation',
 'see',
 'medivizor',
 'novel',
 'outcome',
 'bloodcancer',
 'help',
 '2',
 'treated',
 'read',
 'great',
 'approval',
 'live',

In [25]:
def second_preprocess(tweet):
    # Tokenize the tweet
    tokens = word_tokenize(tweet)
    # Remove stopwords and lemmatize the tokens
    filtered_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if (token.lower() not in stop_words) and (token.lower()in max_freq)]
    filtered_text = " ".join(filtered_tokens)
    final = [i for i in filtered_text if i not in string.punctuation]
    final_text = "".join(final)
    return final_text

In [26]:
df["final_preprocess"] = df["tweet_text"].apply(second_preprocess)

In [27]:
df["final_preprocess"]

1374140386071961602    phase ii study acalabrutinib cll found overall...
1374032432173842437    phase 2 trial still phase study well venetocla...
1373902876553048065                                   nice calquence cll
1373656782367813635    acalabrutinib option ibrutinib data help decis...
1372941634334232586    nice recommended use acalabrutinib chronic lym...
                                             ...                        
551103473643945985                                                   cll
551102786675290112                                                   cll
550969541186953217                                        idelalisib cll
550941480525635584       expression outcome chronic lymphocytic leukemia
550579446537678849         pathway identify chronic lymphocytic leukemia
Name: final_preprocess, Length: 43347, dtype: object

In [28]:
df.to_csv("before_result.csv")

In [29]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [30]:
sia = SentimentIntensityAnalyzer()

In [31]:
df['final_preprocess_tokens'] = df['final_preprocess'].apply(lambda x: x.split())

In [32]:
df1=df[['tweet_author']]


In [33]:
#df1 = df1.explode('final_preprocess_tokens')

In [34]:
df1

Unnamed: 0,tweet_author
1374140386071961602,Hematopoiesis News
1374032432173842437,"Michael Wang, MD"
1373902876553048065,1stOncology
1373656782367813635,Toby Eyre
1372941634334232586,Lymphoma Hub
...,...
551103473643945985,Joy is a Lifestyle
551102786675290112,𝓒𝓻𝓲𝔃𝔃𝔂 𝓟𝓮𝓻𝓻𝔂🌹
550969541186953217,IQWiG
550941480525635584,Medibooks


In [35]:
from nltk.util import ngrams

In [36]:
def extract_ngrams(x):
    tokens = word_tokenize(x)
    n = 2  # set the value of n for the n-grams
    bigrams = ngrams(tokens, n)
    return list(bigrams)

In [37]:
df1["result"] = df["final_preprocess"].apply(extract_ngrams)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["result"] = df["final_preprocess"].apply(extract_ngrams)


In [38]:
df1=df1.explode("result")

In [39]:
print(df1['result'].dtype)

object


In [40]:
df1

Unnamed: 0,tweet_author,result
1374140386071961602,Hematopoiesis News,"(phase, ii)"
1374140386071961602,Hematopoiesis News,"(ii, study)"
1374140386071961602,Hematopoiesis News,"(study, acalabrutinib)"
1374140386071961602,Hematopoiesis News,"(acalabrutinib, cll)"
1374140386071961602,Hematopoiesis News,"(cll, found)"
...,...,...
550941480525635584,Medibooks,"(lymphocytic, leukemia)"
550579446537678849,Medibooks,"(pathway, identify)"
550579446537678849,Medibooks,"(identify, chronic)"
550579446537678849,Medibooks,"(chronic, lymphocytic)"


In [41]:
from textblob import TextBlob

# Define a function to get the sentiment
def get_sentiment(tweet):
    return TextBlob(tweet).sentiment.polarity



In [42]:
#Check for NaN values
print(df1["result"].isnull().sum())

#drop NaN values
df1 = df1.dropna(subset=["result"])

#Convert the column to string
df1["result"] = df1["result"].astype(str)

#Apply the function to get the sentiment
df1["sentiment"] = df1["result"].apply(get_sentiment)





1445


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["result"] = df1["result"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["sentiment"] = df1["result"].apply(get_sentiment)


In [43]:
df1["sentiment"] = df1["sentiment"].apply(lambda x: "positive" if x > 0 else "negative" if x < 0 else "neutral")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["sentiment"] = df1["sentiment"].apply(lambda x: "positive" if x > 0 else "negative" if x < 0 else "neutral")


In [44]:
df1

Unnamed: 0,tweet_author,result,sentiment
1374140386071961602,Hematopoiesis News,"('phase', 'ii')",neutral
1374140386071961602,Hematopoiesis News,"('ii', 'study')",neutral
1374140386071961602,Hematopoiesis News,"('study', 'acalabrutinib')",neutral
1374140386071961602,Hematopoiesis News,"('acalabrutinib', 'cll')",neutral
1374140386071961602,Hematopoiesis News,"('cll', 'found')",neutral
...,...,...,...
550941480525635584,Medibooks,"('lymphocytic', 'leukemia')",neutral
550579446537678849,Medibooks,"('pathway', 'identify')",neutral
550579446537678849,Medibooks,"('identify', 'chronic')",neutral
550579446537678849,Medibooks,"('chronic', 'lymphocytic')",neutral


In [45]:
# Save the results to a csv file
df1[["result","tweet_author","sentiment"]].to_csv("objective2.csv", index=False)