In [1]:
import pandas as pd

In [2]:
from transformers import pipeline

In [3]:
from transformers import AutoModelForTokenClassification

In [4]:
from transformers import AutoTokenizer

In [5]:
# Relative path of the CSV holding the Reddit comments to analyse.
file_path = 'reddit_comments_file.csv'

# Load the comments and preview the first rows to confirm the expected
# columns (id, parent_id, Author, Comment, Score, Created) are present.
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,id,parent_id,Author,Comment,Score,Created
0,hpee8lp,t3_rl8ftz,cascadiacomrade,Favourite National Park has got to be Banff/Ja...,15,2021-12-21 07:24:26
1,hpfdt3l,t3_rl8ftz,MapleHamms,"My favourite is Elk Island in Alberta, I spent...",12,2021-12-21 14:24:53
2,hpeshvk,t3_rl8ftz,HowsYourBobber,[Torngats National Park in NL](https://www.new...,11,2021-12-21 10:41:14
3,hpfgt7s,t3_rl8ftz,[deleted],Yoho has two of the most incredible places on ...,8,2021-12-21 14:48:08
4,hpfjqwh,t3_rl8ftz,english_major,"Gwaii Haanas National Park on Haida Gwaii, BC....",5,2021-12-21 15:10:05


In [6]:
# Checkpoint identifier shared by the tokenizer (here) and the model (next cell),
# so that both are loaded from the same pretrained NER checkpoint.
checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [7]:
# Token-classification model loaded from the same checkpoint as the tokenizer.
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

In [8]:
# NER pipeline built from the model and tokenizer loaded above. As the printed
# output further down shows, it reports one prediction per word-piece token
# ('Tor', '##nga', ...), which is why the results are merged by clear_results.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
# input example sentence
example = "Torngats looks really beautiful. It definitely has that offworld feel that a lot of the Far North parks have. That's what Auyuittuq National Park is to me. Super isolated and rarely visited on Baffin, and a lot of planning and resources to make a trip there worth it but it's definitely the Canadian Holy Grail on my bucket list"
#df.Comment[0]


In [9]:
# Token-level predictions for the example sentence: a list of dicts with the
# keys 'entity', 'score', 'index', 'word', 'start' and 'end'.
ner_results = nlp(example)

In [10]:
# Show the first comment in full (the head() preview truncates the text).
df.Comment[0]

"Favourite National Park has got to be Banff/Jasper. The mountains, lakes, and glaciers are breathtaking along the Icefields Parkway which has got to be one of the most beautiful roads on Earth. There is so much to explore in both parks and so many amazing hikes for all difficulty levels. And of course, everybody loves r/redditlake/\n\nThat being said, if we're including Provincial Parks there are many in BC that (in my opinion) rival Banff/Jasper, albeit without the amenities and easy accessibility since you often have to hike to get to the areas you want to see. Garibaldi Provincial Park takes the cake with the famous Garibaldi Lake, the surrounding Panorama Ridge hike, and nearby Black Tusk for the adventurous. Also views of the potential active volcano Mt. Garibaldi. Plenty of other cool areas like Cheakamus, Wedgemount, and Elfin Lakes."

In [11]:
# Inspect the raw predictions: each word piece is reported separately with its
# own label, score and character offsets.
print(ner_results)

[{'entity': 'B-ORG', 'score': 0.5260769, 'index': 1, 'word': 'Tor', 'start': 0, 'end': 3}, {'entity': 'I-ORG', 'score': 0.64136595, 'index': 2, 'word': '##nga', 'start': 3, 'end': 6}, {'entity': 'B-LOC', 'score': 0.9883427, 'index': 20, 'word': 'Far', 'start': 88, 'end': 91}, {'entity': 'I-LOC', 'score': 0.99209565, 'index': 21, 'word': 'North', 'start': 92, 'end': 97}, {'entity': 'B-LOC', 'score': 0.9738392, 'index': 29, 'word': 'Au', 'start': 122, 'end': 124}, {'entity': 'I-LOC', 'score': 0.96107405, 'index': 30, 'word': '##yu', 'start': 124, 'end': 126}, {'entity': 'I-LOC', 'score': 0.9073081, 'index': 31, 'word': '##itt', 'start': 126, 'end': 129}, {'entity': 'I-LOC', 'score': 0.9326298, 'index': 32, 'word': '##u', 'start': 129, 'end': 130}, {'entity': 'I-LOC', 'score': 0.9606481, 'index': 33, 'word': '##q', 'start': 130, 'end': 131}, {'entity': 'I-LOC', 'score': 0.97804135, 'index': 34, 'word': 'National', 'start': 132, 'end': 140}, {'entity': 'I-LOC', 'score': 0.9789619, 'index':

In [12]:
# Number of token-level predictions returned for the example sentence.
len(ner_results)

17

In [13]:
def clear_results(results, prob):
  """Merge token-level NER predictions into whole entity strings.

  Parameters
  ----------
  results : list of dict
      Token-level predictions. Each dict must provide 'word', 'score',
      'start' and 'end'; any other keys (e.g. 'entity') are carried through.
  prob : float
      Score threshold; only tokens with ``score > prob`` are kept.

  Returns
  -------
  list of dict
      One dict per merged span. It is a copy of the span's first token, so
      it keeps that token's 'entity' and 'score', with 'word' holding the
      merged text and 'end' extended to the end of the span.

  Notes
  -----
  * Word pieces (words starting with '##') that begin exactly where the
    previous kept token ends are glued onto it without a space.
  * Tokens that begin exactly one character after the previous kept token
    are joined with a single space, whatever that character was and whatever
    the entity labels are (so 'Banff/Jasper' becomes 'Banff Jasper').
  * Word pieces that could not be attached to a left neighbour are dropped.
  """
  # Filter the *argument*. The previous version read the module-level
  # `ner_results` here, so the function only worked when the caller happened
  # to use that exact variable name. Each dict is copied so that merging does
  # not mutate the caller's data and repeated calls give the same result.
  kept = [dict(res) for res in results if res['score'] > prob]

  # Walk right-to-left so a token's text is complete before it is folded into
  # its left neighbour. Stop at index 1: the first token has no left
  # neighbour, and index 0 would otherwise wrap around to the last element.
  for i in range(len(kept) - 1, 0, -1):
    cur, prev = kept[i], kept[i - 1]
    if cur['word'].startswith('##') and cur['start'] == prev['end']:
      # Continuation of the same word: append without the '##' marker.
      # `cur` keeps its '##' prefix and is therefore filtered out below.
      prev['word'] = prev['word'] + cur['word'].replace("##", "")
      prev['end'] = cur['end']
    elif cur['start'] == prev['end'] + 1:
      # Next word of the same span: append with a space and mark `cur`
      # as consumed so the final filter drops it.
      prev['word'] = prev['word'] + " " + cur['word']
      cur['word'] = "##"
      prev['end'] = cur['end']

  # Anything still starting with '##' was merged into a neighbour or orphaned.
  return [res for res in kept if not res['word'].startswith("##")]



In [14]:
# Empty accumulator for the entities extracted from all comments.
ner_df = pd.DataFrame()

In [15]:
def add_to_df(clean_results, df):
  """Append one (entity, word) row per cleaned NER result to ``df``.

  Parameters
  ----------
  clean_results : iterable of dict
      Merged NER results; each dict must provide 'entity' and 'word'.
  df : pandas.DataFrame
      Frame to extend. It is not modified.

  Returns
  -------
  pandas.DataFrame
      ``df`` itself when there is nothing to add, otherwise a new frame with
      the extra rows appended in order.
  """
  rows = [[res['entity'], res['word']] for res in clean_results]
  if not rows:
    return df
  # Build all new rows in a single frame and concatenate once, instead of one
  # concat per row (which re-copies the growing frame every time).
  # Every appended row keeps the index label 0, exactly as the per-row concat
  # produced; later cells call reset_index() and de-duplicate over the
  # resulting column, so these labels are preserved deliberately.
  new_rows = pd.DataFrame(rows, columns=['entity', 'word'], index=[0] * len(rows))
  return pd.concat([df, new_rows], axis=0)

In [16]:
# Post-process the example sentence's predictions (score threshold 0.5) and
# collect the merged entities in a fresh frame.
clean_results_ex = clear_results(ner_results, 0.5)
ner_df_ex = add_to_df(clean_results_ex, pd.DataFrame())

In [17]:
# Merged entities for the example sentence (word pieces joined into words).
ner_df_ex

Unnamed: 0,entity,word
0,B-ORG,Tornga
0,B-LOC,Far North
0,B-LOC,Auyuittuq National Park
0,B-LOC,Baffin
0,B-MISC,Canadian Holy Grail


In [18]:
# Run NER on every comment and accumulate the cleaned entities in ner_df.
# A lower score threshold (0.3) is used here than for the example (0.5).
# NOTE(review): ner_df is extended on every pass, so re-running this cell
# without first re-creating the empty ner_df appends the same entities again.
for comment in df['Comment']:
  ner_results = nlp(comment)
  clean_results = clear_results(ner_results, 0.3)
  ner_df = add_to_df(clean_results, ner_df)

In [19]:
# Size check: one row per extracted entity mention, columns entity and word.
ner_df.shape


(1112, 2)

In [20]:
# Distinct entity labels and distinct words among the extracted mentions.
ner_df.nunique()

entity      7
word      507
dtype: int64

In [21]:
# Replace the row labels with a fresh positional index; the old labels are
# kept as an 'index' column (all zeros in the printed frame below).
# NOTE(review): not idempotent — a second run finds the 'index' column
# already present.
ner_df = ner_df.reset_index()

In [22]:
# For every row, store how many rows share the same word. The count is taken
# per word only, so mentions with different entity labels are pooled together.
ner_df['freq_count'] = ner_df.groupby('word')['word'].transform('count')
print(ner_df)

      index  entity               word  freq_count
0         0   B-LOC                  F           2
1         0   B-LOC       Banff Jasper           4
2         0  B-MISC  Icefields Parkway           2
3         0   B-LOC              Earth           1
4         0   B-LOC   Provincial Parks           1
...     ...     ...                ...         ...
1107      0  B-MISC           American           3
1108      0  B-MISC           Canadian           9
1109      0   B-LOC           Waterton          17
1110      0   B-LOC            Dauphin           1
1111      0   B-LOC           Petawawa           1

[1112 rows x 4 columns]


In [19]:
#ner_df = ner_df.loc[ner_df['freq_count'] > 1]
#print(ner_df)

In [23]:
# Drop fully identical rows. The comparison covers all columns (index, entity,
# word, freq_count); since 'index' is 0 in the printed frame and freq_count
# depends only on the word, this leaves one row per (entity, word) pair.
ner_df = ner_df.drop_duplicates(keep='first')

In [24]:
# Inspect the de-duplicated frame; row labels are the surviving positions.
print(ner_df)

      index  entity                              word  freq_count
0         0   B-LOC                                 F           2
1         0   B-LOC                      Banff Jasper           4
2         0  B-MISC                 Icefields Parkway           2
3         0   B-LOC                             Earth           1
4         0   B-LOC                  Provincial Parks           1
...     ...     ...                               ...         ...
1098      0   B-LOC  Writing on Stone Provincial Park           1
1102      0   B-LOC                        Lake Magog           1
1104      0   I-LOC                           Assinib           2
1110      0   B-LOC                           Dauphin           1
1111      0   B-LOC                          Petawawa           1

[556 rows x 4 columns]


In [25]:
# Keep only words longer than two characters, which removes leftover word-piece
# fragments such as 'F'.
# NOTE(review): this also discards genuine abbreviations (the short-word table
# at the end of the notebook lists 'BC' and 'NL') — confirm that is intended.
ner_df = ner_df.loc[ner_df.word.str.len() > 2]

In [26]:
# Words that survived the length filter.
ner_df.word

1                           Banff Jasper
2                      Icefields Parkway
3                                  Earth
4                       Provincial Parks
7              Garibaldi Provincial Park
                      ...               
1098    Writing on Stone Provincial Park
1102                          Lake Magog
1104                             Assinib
1110                             Dauphin
1111                            Petawawa
Name: word, Length: 512, dtype: object

In [27]:
# Ascending sort by frequency, so the most frequent words end up at the tail.
ner_df = ner_df.sort_values(by=['freq_count'])

In [28]:
# Size check after filtering and sorting (sorting does not change the shape).
ner_df.shape

(512, 4)

In [29]:
# Least frequent words (the frame is sorted ascending by freq_count).
ner_df.head(15)

Unnamed: 0,index,entity,word,freq_count
477,0,B-LOC,Fundy Park,1
690,0,B-LOC,Moncton,1
688,0,B-LOC,Wood Buffalo,1
680,0,B-LOC,Ireland,1
670,0,B-LOC,Midland National Park,1
660,0,B-LOC,Nairn Falls,1
659,0,B-LOC,Pemberton,1
658,0,B-ORG,Whistler,1
656,0,B-LOC,Sea,1
653,0,B-LOC,Point Pelee National Park,1


In [30]:
# Most frequent words. The same word can still appear several times here,
# once per entity label it was tagged with.
ner_df.tail(10)

Unnamed: 0,index,entity,word,freq_count
134,0,B-LOC,Ontario,27
17,0,B-LOC,Alberta,31
470,0,B-MISC,Alberta,31
59,0,B-PER,Jasper,35
46,0,B-LOC,Jasper,35
57,0,B-LOC,Canada,35
58,0,B-PER,Banff,46
431,0,B-MISC,Banff,46
259,0,B-ORG,Banff,46
47,0,B-LOC,Banff,46


In [31]:
# Keep only word and freq_count; the entity label and the old 'index' column
# are no longer needed for the per-word frequency table.
ner_df = ner_df.drop(columns = ['entity','index'])

In [32]:
# With the entity column gone, rows for the same word are identical; keep the
# first so each word appears once.
ner_df = ner_df.drop_duplicates(keep='first')

In [33]:
# Size of the final table: one row per distinct word, columns word and freq_count.
ner_df.shape

(470, 2)

In [34]:
# Ten most frequent words, each now listed once.
ner_df.tail(10)

Unnamed: 0,word,freq_count
385,Pacific Rim,12
393,Waterton,17
405,Algonquin,17
356,Gros Morne,22
119,Yoho,25
134,Ontario,27
17,Alberta,31
59,Jasper,35
57,Canada,35
58,Banff,46


In [35]:
# Specify the file path and name for the CSV file
# NOTE: this rebinds `file_path`, which held the input CSV path earlier in the
# notebook.
file_path = 'entity_file.csv'  # Replace with the desired file path and name

# Save the DataFrame to CSV
# With to_csv's default arguments the row labels are written as the first,
# unnamed column of the file.
ner_df.to_csv(file_path)

In [28]:
# Collect the short words (one or two characters) for separate inspection.
# NOTE(review): the execution count of this cell ([28]) shows it was run out of
# order. At this position in the notebook ner_df has already been restricted
# to words longer than two characters and reduced to the word/freq_count
# columns, so a top-to-bottom run would produce an empty frame rather than the
# (20, 4) result recorded below. Take this slice before the length filter.
ner_df3 = ner_df.loc[ner_df.word.str.len() <= 2]

In [29]:
# Drop fully identical rows from the short-word frame, keeping the first.
ner_df3 = ner_df3.drop_duplicates(keep='first')

In [30]:
# Number of distinct short-word rows.
ner_df3.shape

(20, 4)

In [31]:
# List the short words: a mix of word-piece fragments ('F', 'K', 'Fr') and
# real abbreviations ('BC', 'NL').
ner_df3.head(20)

Unnamed: 0,index,entity,word,freq_count
0,0,B-LOC,F,1
4,0,B-LOC,BC,23
10,0,B-LOC,Mt,5
18,0,B-LOC,NL,4
104,0,B-LOC,K,1
113,0,B-LOC,Fr,1
133,0,B-LOC,C,2
172,0,B-LOC,Pa,1
182,0,B-LOC,Ko,4
196,0,B-LOC,Au,1
