In [1]:
import numpy
import urllib.request
import scipy.optimize
import random
from collections import defaultdict
import nltk
import string
from nltk.stem.porter import *
from sklearn import linear_model

#nltk.download("stopwords")
from nltk.corpus import stopwords

def parseData(fname):
    with open(fname, encoding='utf-8', errors='ignore') as f:
        data = f.read()
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    data = tokenizer.tokenize(data)
    return data

stopWords = set(stopwords.words("english"))


print("Reading data...")
data = parseData('T1.txt')
print(data)
print("done")

Reading data...
['Transcript', '01', 'P', ':', 'Today', 'is', '____.', 'We', 'are', 'doing', 'session', '4.', 'Is', 'that', 'right', '?', 'C', ':', 'Yeah', 'P', ':', 'Session', '4.', 'CBSST.', 'Social', 'skills', 'training.', 'This', 'is', 'GPxxx', 'with', '[', 'name', ']', '.', 'Okay.', 'What', 'page', 'is', 'that', 'on', 'there', '?', 'C', ':', '[', 'unintelligible', ']', 'P', ':', 'Okay.', 'Making', 'positive', 'requests.', 'Is', 'that', 'right', '?', 'Session', '4', '?', 'C', ':', 'Uh', 'hum.', 'P', ':', 'Okay.', 'So', ',', 'um', ',', 'what', 'do', 'we', 'do', 'first', '?', 'Set', 'an', 'agenda', '?', 'C', ':', 'Yeah.', 'We', 'read', 'this.', 'I', 'read', 'this.', 'P', ':', 'Uh', 'hum.', 'Youre', 'gon', 'na', 'to', 'review', 'your', 'at-home', 'practice.', 'Were', 'gunna', 'review', 'prior', 'communication', 'skills.', 'Were', 'gon', 'na', 'learn', 'making', 'positive', 'requests.', 'Uh', ',', 'talk', 'about', 'at-home', 'practice', 'and', 'then', 'do', 'you', 'want', 'to', 'add', 

In [2]:
### Ignore capitalization and remove punctuation

wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for d in data:
    w = stemmer.stem(d.lower())
    if not w in stopWords:
        wordCount[w] += 1
print(len(data))
print(len(wordCount))
print(wordCount)

7619
599
defaultdict(<class 'int'>, {'transcript': 1, '01': 1, 'p': 266, ':': 524, 'today': 6, '____.': 1, 'session': 6, '4.': 2, 'right': 15, '?': 250, 'c': 267, 'yeah': 21, 'cbsst.': 1, 'social': 1, 'skill': 10, 'training.': 1, 'thi': 20, 'gpxxx': 1, '[': 36, 'name': 26, ']': 36, '.': 8, 'okay.': 66, 'page': 3, 'unintellig': 6, 'make': 86, 'posit': 61, 'requests.': 4, '4': 1, 'uh': 30, 'hum.': 5, ',': 245, 'um': 14, 'first': 8, 'set': 1, 'agenda': 2, 'yeah.': 40, 'read': 3, 'this.': 3, 'gon': 4, 'na': 4, 'review': 6, 'at-hom': 5, 'practice.': 4, 'gunna': 1, 'prior': 1, 'commun': 12, 'skills.': 5, 'learn': 17, 'talk': 13, 'practic': 44, 'want': 34, 'add': 1, 'anyth': 5, 'palm': 1, 'pilot.': 1, 'wa': 25, 'someth': 14, 'suppos': 3, 'oh': 18, 'no.': 6, 'iti': 1, 'somebodi': 11, 'else.': 1, 'huh.': 6, 'look': 6, 'it.': 11, 'gave': 1, 'back': 3, 'alreadi': 2, 'p.': 2, 'dont': 7, 'need': 4, 'discuss': 1, 'alright.': 8, 'let': 25, 'abl': 5, 'complet': 2, 'did.': 3, 'describ': 2, 'situation.'

In [3]:
# nomalization
sorted_wordCount = sorted(wordCount.items(), key = lambda kv: kv[1])
for key, value in sorted_wordCount:
    value = value/len(data)
    print ("%s: %s" %(key, value))


transcript: 0.000131250820317627
01: 0.000131250820317627
____.: 0.000131250820317627
cbsst.: 0.000131250820317627
social: 0.000131250820317627
training.: 0.000131250820317627
gpxxx: 0.000131250820317627
4: 0.000131250820317627
set: 0.000131250820317627
gunna: 0.000131250820317627
prior: 0.000131250820317627
add: 0.000131250820317627
palm: 0.000131250820317627
pilot.: 0.000131250820317627
iti: 0.000131250820317627
else.: 0.000131250820317627
gave: 0.000131250820317627
discuss: 0.000131250820317627
station: 0.000131250820317627
daytime.: 0.000131250820317627
station.: 0.000131250820317627
background: 0.000131250820317627
nois: 0.000131250820317627
coupl: 0.000131250820317627
coupleit: 0.000131250820317627
funni: 0.000131250820317627
ivan.: 0.000131250820317627
bu: 0.000131250820317627
stop: 0.000131250820317627
away.: 0.000131250820317627
away: 0.000131250820317627
now.: 0.000131250820317627
humm: 0.000131250820317627
decently.: 0.000131250820317627
prioriti: 0.000131250820317627
volume

In [4]:
# plot pdf
import matplotlib.pyplot as plt
k, y = zip(*sorted_wordCount)
x = range(0,len(wordCount))
plt.plot(x, y)
plt.xlabel('number')
plt.ylabel('counts')
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
fig.savefig('test2png.png', dpi=100)
fig.set_size_inches(18.5, 10.5, forward=True)
plt.show()

<Figure size 1850x1050 with 1 Axes>

In [5]:
#tokenize by sentence
def sentData(fname):
    with open(fname, encoding='utf-8', errors='ignore') as f:
        data = f.read()
    from nltk.tokenize import sent_tokenize
    sentData = sent_tokenize(data)
    return sentData

print("Reading data...")
data = sentData('T1.txt')
print(data)
print("done")

Reading data...
['Transcript 01\n\nP: Today is ____.', 'We are doing session 4.', 'Is that right?', 'C: Yeah\nP: Session 4.', 'CBSST.', 'Social skills training.', 'This is GPxxx with [name].', 'Okay.', 'What page is that on there?', 'C: [unintelligible]\nP: Okay.', 'Making positive requests.', 'Is that right?', 'Session 4?', 'C: Uh hum.', 'P: Okay.', 'So, um, what do we do first?', 'Set an agenda?', 'C: Yeah.', 'We read this.', 'I read this.', 'P: Uh hum.', 'Youre gonna to review your at-home practice.', 'Were gunna review prior communication skills.', 'Were gonna learn making positive requests.', 'Uh, talk about at-home practice and then do you want to add anything to the agenda?', 'C: No\nP: [unintelligible] palm pilot.', 'Was that something we were supposed to talk about?', 'C: Oh, no.', 'It, it, it, it, itI had that with somebody else.', 'P: Uh huh.', 'C: [unintelligible] looks for it.', 'P: You gave it back already?', 'C: Yeah.', 'P. Okay.', 'We dont need to discuss that then?', '

In [6]:
# segmentation by line
def lineData(fname):
    data = []
    with open(fname, encoding='utf-8', errors='ignore') as f:
        for line in f:
            data.append(line.lower())
    return data

print("Reading data...")
data = lineData('T1.txt')
print(data)
print("done")


Reading data...
['transcript 01\n', '\n', 'p: today is ____. we are doing session 4. is that right?\n', 'c: yeah\n', 'p: session 4. cbsst. social skills training. this is gpxxx with [name]. okay. what page is that on there?\n', 'c: [unintelligible]\n', 'p: okay. making positive requests. is that right? session 4?\n', 'c: uh hum.\n', 'p: okay. so, um, what do we do first? set an agenda?\n', 'c: yeah. we read this. i read this.\n', 'p: uh hum. youre gonna to review your at-home practice. were gunna review prior communication skills. were gonna learn making positive requests. uh, talk about at-home practice and then do you want to add anything to the agenda?\n', 'c: no\n', 'p: [unintelligible] palm pilot. was that something we were supposed to talk about?\n', 'c: oh, no. it, it, it, it, iti had that with somebody else.\n', 'p: uh huh.\n', 'c: [unintelligible] looks for it.\n', 'p: you gave it back already? \n', 'c: yeah.\n', 'p. okay. we dont need to discuss that then?\n', 'c: no.\n', 'p:

In [7]:
def sentLine(line):
    from nltk.tokenize import sent_tokenize
    sentData = sent_tokenize(line)
    return sentData

for line in data:
    if not (line[0] == 'p' or line[0] == 'c'):
        print(line)
    else:
        print(sentLine(line))
        


transcript 01



['p: today is ____.', 'we are doing session 4. is that right?']
['c: yeah']
['p: session 4. cbsst.', 'social skills training.', 'this is gpxxx with [name].', 'okay.', 'what page is that on there?']
['c: [unintelligible]']
['p: okay.', 'making positive requests.', 'is that right?', 'session 4?']
['c: uh hum.']
['p: okay.', 'so, um, what do we do first?', 'set an agenda?']
['c: yeah.', 'we read this.', 'i read this.']
['p: uh hum.', 'youre gonna to review your at-home practice.', 'were gunna review prior communication skills.', 'were gonna learn making positive requests.', 'uh, talk about at-home practice and then do you want to add anything to the agenda?']
['c: no']
['p: [unintelligible] palm pilot.', 'was that something we were supposed to talk about?']
['c: oh, no.', 'it, it, it, it, iti had that with somebody else.']
['p: uh huh.']
['c: [unintelligible] looks for it.']
['p: you gave it back already?']
['c: yeah.']
['p. okay.', 'we dont need to discuss that then?']
[

In [8]:
for line in data:
    if not (line[0] == 'p' or line[0] == 'c'):
        print(line)

transcript 01



[laughter]

[laughter]

[recording end]



In [10]:
#dictionary classification
#dic1 = emoj
#dic2 = domain keyword
from PyDictionary import PyDictionary
dic = PyDictionary("good")
print(dic.getSynonyms())
#dataset labelled by person

good has no Synonyms in the API
[None]
