In [1]:
import nltk
import pandas as pd
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from collections import Counter

# Load the dataset from a CSV file
df = pd.read_csv('ner_datasetreference.csv', encoding='ISO-8859-1')
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [2]:
df['clean_text']=df['Word'].str.lower()
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag,clean_text
0,Sentence: 1,Thousands,NNS,O,thousands
1,,of,IN,O,of
2,,demonstrators,NNS,O,demonstrators
3,,have,VBP,O,have
4,,marched,VBN,O,marched


In [3]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
def remove_punctuations(text):
    punctuations=string.punctuation
    return text.translate(str.maketrans('','', punctuations))

In [5]:
" ".join(stopwords.words('english'))

"i me my myself we our ours ourselves you you're you've you'll you'd your yours yourself yourselves he him his himself she she's her hers herself it it's its itself they them their theirs themselves what which who whom this that that'll these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don don't should should've now d ll m o re ve y ain aren aren't couldn couldn't didn didn't doesn doesn't hadn hadn't hasn hasn't haven haven't isn isn't ma mightn mightn't mustn mustn't needn needn't shan shan't shouldn shouldn't wasn wasn't weren weren't won won't wouldn wouldn't"

In [6]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    punctuations=string.punctuation
    return " ".join([word for word in text.split()if word not in STOPWORDS])


In [7]:
df['clean_text']=df['clean_text'].apply(lambda x:  remove_stopwords(x))
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag,clean_text
0,Sentence: 1,Thousands,NNS,O,thousands
1,,of,IN,O,
2,,demonstrators,NNS,O,demonstrators
3,,have,VBP,O,
4,,marched,VBN,O,marched


In [8]:
#Removal of frequent words
from collections import Counter
word_count=Counter()
for text in df['clean_text'].values:
    for word in text.split():
        word_count[text]+=1
word_count.most_common(10)

[('.', 47761),
 (',', 32754),
 ("'s", 10923),
 ('said', 5329),
 ('says', 4640),
 ('say', 4178),
 ('u.s.', 4129),
 ('"', 3686),
 ('president', 3396),
 ('officials', 3390)]

In [9]:
FREQUENT_WORDS = set(word for(word,wc)in  word_count.most_common(3))
def  remove_freq_words(text):
    
    return " ".join([word for word in text.split()if word not in FREQUENT_WORDS])

In [10]:
df['clean_text']=df['clean_text'].apply(lambda x:  remove_freq_words(x))
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag,clean_text
0,Sentence: 1,Thousands,NNS,O,thousands
1,,of,IN,O,
2,,demonstrators,NNS,O,demonstrators
3,,have,VBP,O,
4,,marched,VBN,O,marched


In [11]:
#remove special character
import re
def  remove_spl_chars(text):
     text = re.sub(r"[^a-zA-Z0-9]", " ", text)
     text = re.sub("\s+", " ", text)
     return text

In [12]:
df['clean_text']=df['clean_text'].apply(lambda x:  remove_spl_chars(x))
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag,clean_text
0,Sentence: 1,Thousands,NNS,O,thousands
1,,of,IN,O,
2,,demonstrators,NNS,O,demonstrators
3,,have,VBP,O,
4,,marched,VBN,O,marched


In [13]:
#stemming
from nltk.stem import PorterStemmer
ps= PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])


In [14]:
df['stemmed_text']=df['clean_text'].apply(lambda x:  stem_words(x))
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag,clean_text,stemmed_text
0,Sentence: 1,Thousands,NNS,O,thousands,thousand
1,,of,IN,O,,
2,,demonstrators,NNS,O,demonstrators,demonstr
3,,have,VBP,O,,
4,,marched,VBN,O,marched,march


In [15]:
# Lemmatization
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

def lemmatize_words(text):
    pos_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/basmalakhaled/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/basmalakhaled/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
df['lemmatized_words'] = df['clean_text'].apply(lambda x: lemmatize_words(x))
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag,clean_text,stemmed_text,lemmatized_words
0,Sentence: 1,Thousands,NNS,O,thousands,thousand,thousand
1,,of,IN,O,,,
2,,demonstrators,NNS,O,demonstrators,demonstr,demonstrator
3,,have,VBP,O,,,
4,,marched,VBN,O,marched,march,march


In [17]:


# Load the dataset from a CSV file
data = pd.read_csv('ner_datasetreference.csv', encoding='ISO-8859-1')

# Access specific columns
sentences = data['Sentence #'].tolist()
words = data['Word'].tolist()

# Limit the number of sentences for processing
sentences = [str(sentence) for sentence in sentences if isinstance(sentence, str)]
sentences = sentences[:10]

# Initialize a list to store the stemmed sentences
stemmed_sentences = []

# Preprocess the sentences
for sentence in sentences:
    tokens = word_tokenize(sentence)
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    stemmed_sentences.append(stemmed_tokens)

# Calculate n-grams for Named Entity Recognition
n = 3
all_ngrams = []
for sentence in stemmed_sentences:
    all_ngrams.extend(ngrams(sentence, n))


# Calculate probabilities of n-grams for Named Entity Recognition
ngram_counter = Counter(all_ngrams)
total_ngrams = sum(ngram_counter.values())
probabilities = {ngram: count / total_ngrams for ngram, count in ngram_counter.items()}

# Add probabilities to the DataFrame
for i, (ngram, prob) in enumerate(probabilities.items()):
    data.loc[i, 'probability'] = prob
    if i >= 9:
        break

# Print the first few rows of the DataFrame
data.head(10)


Unnamed: 0,Sentence #,Word,POS,Tag,probability
0,Sentence: 1,Thousands,NNS,O,0.1
1,,of,IN,O,0.1
2,,demonstrators,NNS,O,0.1
3,,have,VBP,O,0.1
4,,marched,VBN,O,0.1
5,,through,IN,O,0.1
6,,London,NNP,B-geo,0.1
7,,to,TO,O,0.1
8,,protest,VB,O,0.1
9,,the,DT,O,0.1


In [18]:
import pandas as pd
import nltk
from itertools import chain

# Assuming you have a 'Word' column in your DataFrame containing the text data
df = pd.read_csv('ner_datasetreference.csv', encoding='ISO-8859-1')



def calc_ngrams(text, n):
    # Implement your n-gram calculation logic here
    # This function should return a list of n-grams
    # For example, using nltk ngrams:
    ngrams = list(nltk.ngrams(nltk.word_tokenize(text), n))
    return ngrams

def count_ngram(df, n):
    # Assuming 'Word' is the column containing text data
    ngrams = df['Word'].apply(lambda x: calc_ngrams(x, n))
    flattened_ngrams = list(chain.from_iterable(ngrams))
    fd = nltk.FreqDist(flattened_ngrams)
    for k, v in fd.items():
        print(k, v)

def count_unigram(df):
    tokens = df['Word'].apply(nltk.word_tokenize)
    flattened_tokens = list(chain.from_iterable(tokens))
    fd = nltk.FreqDist(flattened_tokens)
    for k, v in fd.items():
        print(k, v)

# Count and print trigrams
count_ngram(df, 3)

# Count and print unigrams
count_unigram(df)


('A', '&', 'M') 1
('https', ':', '//www.celebritiesforcharity.org/raffles/netraffle_main.cfm') 1
('W', '?', 'odzimierz') 1
('Sana', "'", 'a') 1
('AT', '&', 'T') 2
('H', '&', 'M') 1
('#', 'NAME', '?') 1
('R', '&', 'B') 1
('jcfundrzr', '@', 'aol.com') 1
('S', '&', 'P') 2
Thousands 114
of 26354
demonstrators 110
have 5485
marched 65
through 515
London 261
to 23213
protest 237
the 52573
war 720
in 26323
Iraq 1738
and 19936
demand 220
withdrawal 154
British 637
troops 1195
from 4539
that 6301
country 1925
. 56382
Families 6
soldiers 757
killed 2861
conflict 245
joined 116
protesters 197
who 1919
carried 222
banners 11
with 5381
such 408
slogans 36
as 4106
`` 3686
Bush 976
Number 1
One 166
Terrorist 4
Stop 3
Bombings 4
They 572
Houses 2
Parliament 54
a 20482
rally 97
Hyde 2
Park 20
Police 470
put 183
number 461
marchers 7
at 4343
10,000 56
while 576
organizers 22
claimed 281
it 3021
was 4878
1,00,000 36
The 11313
comes 234
on 6710
eve 23
annual 133
conference 240
Britain 335
's 10925
ruling 

cares 1
grief 4
offended 2
knows 7
demeanor 2
al-Hindawi 1
Duluiya 1
Pakistani-Afghan 1
Machinea 1
213 2
Guji 2
Borena 2
Shakiso 2
Arero 2
Yabello 2
Jaatanni 1
Taadhii 1
39,000 2
sided 4
scooter 2
Segway 2
upright 1
two-wheeled 1
Guest 2
one-meter-long 1
gyroscopes 1
tricky 1
scooters 1
Warrantless 1
lowers 2
non-OPEC 1
Bunia 3
Floribert 1
Ndjabu 2
Integrationist 1
precedence 1
Rigoberta 1
Menchu 1
Events 1
rulings 5
Miroslav 4
Bralo 3
Jokers 1
forcers 1
alliance-head 1
top-level 3
Dwain 2
Chambers 8
cheats 1
Mironov 3
Rodina 2
Pensioners 2
Assimilating 1
visually 1
impaired 1
Ancic 5
Ordina 1
Den 1
Bosch 1
Llodra 2
Klara 1
Koukalova 2
countrywoman 2
Ljube 1
Boskovski 4
Ljubotno 1
Skopje 1
Tarulovski 2
pre-2007 1
Malcolm 2
reception 2
Radisson 1
CPPCC 3
Expectations 1
telecast 1
Hizb-e-Islami 1
Homes 1
conferring 2
Singnaghi 1
Neighboring 1
tugboats 1
Mavi 1
Marmaraout 1
commandeered 3
74 4
Neuilly 1
stand-in 2
Andiwal 2
Marja 1
Jalani 1
Levinson 2
Kish 1
locating 2
duffle 1
218-5 1
24