In [1]:
import pandas as pd
import numpy as np
from collections import Counter as ctr
from operator import itemgetter

In [2]:
train = pd.read_csv('pnp-train.txt',delimiter='\t',encoding='latin-1', names=['type','name'])
train['name'] = train.name.map(lambda x: x.lower().split())
train[:10]

Unnamed: 0,type,name
0,drug,[dilotab]
1,movie,"[beastie, boys:, live, in, glasgow]"
2,person,"[michelle, ford-eriksson]"
3,place,[ramsbury]
4,place,"[market, bosworth]"
5,drug,"[cyanide, antidote, package]"
6,person,"[bill, johnson]"
7,place,[ettalong]
8,movie,"[the, suicide, club]"
9,place,[pézenas]


### Probability of type

In [3]:
smooth = 0.00001
train_ctr = ctr(train.type)
def Pt(T=''):
    return train_ctr[T]/len(train)

In [4]:
Pt(T='person') + Pt(T='place') + Pt(T='movie') + Pt(T='drug') + Pt(T='company')

1.0

In [5]:
all_words_ctr = ctr([word for row in train.name for word in row])
#sum(all_words_ctr.values())

### Probability of a word in name

In [6]:
def Pn(N=''):
    if N not in all_words_ctr: return smooth
    return all_words_ctr[N]/sum(all_words_ctr.values())

In [7]:
Pn(N='bill')

0.0002544828126223475

### Probability of name given type

In [8]:
type_dict = {}
for row in set(train.type):
    temp_df = train[train.type == row]
    counted = ctr([word for row in temp_df.name for word in row])
    type_dict[row] = counted

def Pnt(N='',T=''):
    if N not in type_dict[T]: return smooth
    return type_dict[T][N] / train_ctr[T]

In [9]:
Pnt(N='michelle',T='person')

0.0005213764337851929

### Probability of type given name

In [10]:
def Ptn(T='',N=''):
    return Pnt(N,T) * Pt(T) / Pn(N)

In [11]:
Ptn(T='person', N='Billy')

0.18265796866815867

### Probability of type given name

In [12]:
def Pts(T='', S=[]):
    prob = 1
    for element in S:
        prob = prob*Ptn(T,element)
    return prob

### Testing

In [13]:
test = pd.read_csv('pnp-test.txt',delimiter='\t',encoding='latin-1', names=['type','name'])
test['name'] = test.name.map(lambda x: x.lower().split())
test[:5]

Unnamed: 0,type,name
0,movie,"[the, natalie, cole, story]"
1,place,[zwanenburg]
2,movie,[bootmen]
3,movie,"[the, prince, and, the, pauper]"
4,person,"[vic, willis]"


In [14]:
all_words_in_name = list(set([word for row in train.name for word in row]))

guesses = []
for phrase in test['name']:
    guess = [(t,Pts(t, phrase)) for t in list(set(train.type))]
    best_guess = sorted(guess, key=itemgetter(1))[-1][0]
    guesses.append(best_guess)

In [15]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(test.type, guesses)

accuracy

0.675047619047619

In [16]:
guesses_ctr = ctr(guesses)
guesses_ctr

Counter({'movie': 1506,
         'person': 390,
         'company': 359,
         'drug': 327,
         'place': 43})

## Answer the following questions in markdown cells:

### What is the accuracy of your classifier?

67.54%

### What is the random baseline for this task? What is the most common baseline for this task? Is the classifier working well when compared to these baselines?

most common baseline is movie.

### What independence assumptions does your classifier make?

the naive part of it assumes that my 'variables' are independent

### Identify three possible things you could try in order to improve your results.

### What constitutes "training" for this classifier? Is the classifier actually "learning" anything?

Using the data from the train dataframe... This means the counts of every word in name, type, etc. I believe it would count as learning since it can somewhat predict from the data given

### Why would changing your smoothing value change the results?

because the smoothing value accounts for the words in the train/test set that are not in the 'type'

In [53]:
from client.api.notebook import Notebook
ok = Notebook('me.ok')
ok.auth(inline=True, force=True)

Assignment: NLP S19 Midterm Exam
OK, version v1.13.11


Open the following URL:

https://okpy.org/client/login/

After logging in, copy the code from the web page and paste it into the box.
Then press the "Enter" key on your keyboard.

Paste your code here: VJde891nqRsVCuhuO3u55wXsdRxhgO
Successfully logged in as emanuelhernandez@u.boisestate.edu


In [54]:
ok.submit()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saving notebook... Saved 'ME-nbayes.ipynb'.
Submit... 100% complete
Submission successful for user: emanuelhernandez@u.boisestate.edu
URL: https://okpy.org/boisestate/cs4-533/sp19/me/submissions/vll8mV

