In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [50]:
df = pd.read_csv("a3_train_round1.tsv",sep='\t',header = None)

df.rename(columns={0:'Labels',1:'Comments'}, inplace=True)
df.fillna("")

# Check if the data set is balanced
df['Labels'].value_counts() 


0    4867
1    4755
Name: Labels, dtype: int64

In [3]:
df.head()

Unnamed: 0,Labels,Comments
0,0,I won't get vaccinated because it's business f...
1,0,This vaccine is not determine as the solution ...
2,0,I haven't had a vaccine since I was a young ch...
3,1,It’s weird how when a virus has a 3% chance of...
4,1,The only side effect after receiving my COVID-...


In [74]:
# GET A TRAIN TEST SPLIT
training_data, testing_data = train_test_split(df,test_size=0.25, random_state = 2000)

# GET LABELS
Y_train=training_data['Labels'].values
Y_test=testing_data['Labels'].values
     
# GET FEATURES
field = "Comments"
X_train,X_test,feature_transformer=extract_features(df, field, training_data,testing_data)

In [65]:
def extract_features(df, field, training_data, testing_data):
    """Extract features using different methods"""
         
    # BINARY FEATURE REPRESENTATION
    cv= CountVectorizer(binary=True, max_df=0.95)
    cv.fit_transform(training_data[field].values)

    train_feature_set=cv.transform(training_data[field].values)
    test_feature_set=cv.transform(testing_data[field].values)

    return train_feature_set,test_feature_set,cv

In [66]:

scikit_log_reg = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)
model=scikit_log_reg.fit(X_train,Y_train)

[LibLinear]

In [71]:
# GET TOP K PREDICTIONS
preds=get_top_k_predictions(model,X_test,3)
    
# GET PREDICTED VALUES AND GROUND TRUTH INTO A LIST OF LISTS
#eval_items=collect_preds(Y_test,preds)
    
# GET EVALUATION NUMBERS ON TEST SET -- HOW DID WE DO?

#accuracy=compute_accuracy(eval_items)
#mrr_at_k=compute_mrr_at_k(eval_items)

In [68]:
def get_top_k_predictions(model,X_test,k):
    
    # get probabilities instead of predicted labels, since we want to collect top 3
    probs = model.predict_proba(X_test)

    # GET TOP K PREDICTIONS BY PROB - note these are just index
    best_n = np.argsort(probs, axis=1)[:,-k:]

    # GET CATEGORY OF PREDICTIONS
    preds=[[model.classes_[predicted_cat] for predicted_cat in prediction] for prediction in best_n]

    # REVERSE CATEGORIES - DESCENDING ORDER OF IMPORTANCE
    preds=[ item[::-1] for item in preds]

    return preds

In [68]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords 

def tokenizer(text):

    tokens_ = [word_tokenize(sent) for sent in sent_tokenize(text)]

    tokens = []
    for token_by_sent in tokens_:
        tokens += token_by_sent

    tokens = list(filter(lambda t: t.lower() not in stop, tokens))
    tokens = list(filter(lambda t: t not in punctuation, tokens))
    tokens = list(filter(lambda t: t not in [u"'s", u"n't", u"...", u"''", u'``', u'\u2014', u'\u2026', u'\u2013'], tokens))
     
    filtered_tokens = []
    for token in tokens:
        token = wnl.lemmatize(token)
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)

    filtered_tokens = list(map(lambda token: token.lower(), filtered_tokens))

    return filtered_tokens


def tokenize_data(dataset):
    stop_words = set(stopwords.words('english'))
    dataset["Comments"] = ([token.lower() for token in dataset["Comments"] if token not in stop_words])
    
    dataset["tokenized_sents"] = dataset["Comments"].fillna("").map(nltk.word_tokenize)
    #dataset["Tokens"] = dataset.Comments.apply(lambda row: nltk.word_tokenize(row["Comments"]))
    print(dataset)    
    return dataset


def remove_stop_words(dataset):
    stop_words = set(stopwords.words('english'))
    
    #for i in range(dataset.shape[0]):
    dataset["Comments"] = ([token.lower() for token in dataset["Comments"] if token not in stop_words])
    
    return dataset


def normalize(dataset):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    for i in range(dataset.shape[0]):
        dataset.Comments[i] = " ".join([lemmatizer.lemmatize(token) for token in dataset.Comments[i]]).strip()
    return dataset

def remove_garbage(dataset):
    garbage = "~`!@#$%^&*()_-+={[}]|\:;'<,>.?/"
    for i in range(dataset.shape[0]):
        dataset.Comments[i] = "".join([char for char in dataset.Comments[i] if char not in garbage])
    return dataset

# GET A TRAIN TEST SPLIT
train_data, test_data = train_test_split(df,test_size=0.25, random_state = 2000)

# GET LABELS
Y_train=train_data['Labels'].values
Y_test=test_data['Labels'].values

#print(train_data)
train_data = tokenize_data(train_data)
train_data = remove_stop_words(train_data)
train_data = normalize(train_data)
train_data = remove_garbage(train_data)
test_data = tokenize_data(test_data)
test_data = remove_stop_words(test_data)
test_data = normalize(test_data)
test_data = remove_garbage(test_data)


def fit_corpus(train_data, test_data):
    corpus = pd.DataFrame({"Comments": train_data["Comments"]})
    corpus.reviews.append(test_data["Comments"], ignore_index=True)
    tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1,2))
    tfidf.fit(corpus["Comments"])
    return tfidf

def transform_data(tfidf, dataset):
    features = tfidf.transform(dataset["Comments"])
    return pd.DataFrame(features.todense(), columns = tfidf.get_feature_names())

tfidf = fit_corpus(train_data, test_data)  #Fitting the vecorizer
train_features = transform_data(tfidf, train_data)  #transforming 
test_features = transform_data(tfidf, test_data)    #Train and Test
train_labels = train_data["labels"]  #Taking lables in separate
test_labels = test_data["labels"]    #variables

ValueError: Length of values does not match length of index

In [10]:
def preprocess_text(text):
    # Tokenise words while ignoring punctuation
    tokeniser = RegexpTokenizer(r'\w+')
    tokens = tokeniser.tokenize(text)
    
    # Lowercase and lemmatise 
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]
    
    # Remove stopwords
    keywords= [lemma for lemma in lemmas if lemma not in stopwords.words('english')]
    return keywords

In [12]:


# Import module
from nltk.tokenize import RegexpTokenizer

part1 = """We are gathered here today on this joyous occasion to celebrate the special love that Monica and Chandler share. It is a love based on giving and receiving as well as having and sharing. And the love that they give and have is shared and received. And
through this having and giving and sharing and receiving, we too can share and love and have... and receive."""
part2 = """When I think of the love these two givers and receivers share I cannot help but envy the lifetime ahead of having and loving and giving and receiving."""

# Create an instance of RegexpTokenizer for alphanumeric tokens
tokeniser = RegexpTokenizer(r'\w+')

# Create a dataframe
X_train = pd.DataFrame([part1, part2], columns=['speech'])

# Tokenise 'part1' string
tokens = tokeniser.tokenize(part1)
print(tokens)

# Import module
from nltk.stem import WordNetLemmatizer
# Create an instance of WordNetLemmatizer
lemmatiser = WordNetLemmatizer()
# Lowercase and lemmatise tokens
lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]
print(lemmas)

# Check how many words we have
len(lemmas)


# Import module
from nltk.corpus import stopwords
# Check out how many stop words there are 
print(len(stopwords.words('english')))
# See first 5 stop words
stopwords.words('english')[:5]

keywords = [lemma for lemma in lemmas if lemma not in stopwords.words('english')]
print(keywords)

# Check how many words we have
len(keywords)

{word: keywords.count(word) for word in set(keywords)}

# Import module
from sklearn.feature_extraction.text import CountVectorizer
# Create an instance of CountfVectorizer
vectoriser = CountVectorizer(analyzer=preprocess_text)
# Fit to the data and transform to feature matrix
X_train = vectoriser.fit_transform(X_train['speech'])


# Convert sparse matrix to dataframe
X_train = pd.DataFrame.sparse.from_spmatrix(X_train)
# Save mapping on which index refers to which terms
col_map = {v:k for k, v in vectoriser.vocabulary_.items()}
# Rename each column using the mapping
for col in X_train.columns:
    X_train.rename(columns={col: col_map[col]}, inplace=True)
X_train


# Import module
from sklearn.feature_extraction.text import TfidfTransformer
# Create an instance of TfidfTransformer
transformer = TfidfTransformer()
# Fit to the data and transform to tf-idf
X_train = pd.DataFrame(transformer.fit_transform(X_train).toarray(), columns=X_train.columns)
X_train


# Import module
from sklearn.feature_extraction.text import TfidfVectorizer
# Create an instance of TfidfVectorizer
vectoriser = TfidfVectorizer(analyzer=preprocess_text)
# Fit to the data and transform to tf-idf
X_train = vectoriser.fit_transform(X_train['speech'])



['We', 'are', 'gathered', 'here', 'today', 'on', 'this', 'joyous', 'occasion', 'to', 'celebrate', 'the', 'special', 'love', 'that', 'Monica', 'and', 'Chandler', 'share', 'It', 'is', 'a', 'love', 'based', 'on', 'giving', 'and', 'receiving', 'as', 'well', 'as', 'having', 'and', 'sharing', 'And', 'the', 'love', 'that', 'they', 'give', 'and', 'have', 'is', 'shared', 'and', 'received', 'And', 'through', 'this', 'having', 'and', 'giving', 'and', 'sharing', 'and', 'receiving', 'we', 'too', 'can', 'share', 'and', 'love', 'and', 'have', 'and', 'receive']
['we', 'be', 'gather', 'here', 'today', 'on', 'this', 'joyous', 'occasion', 'to', 'celebrate', 'the', 'special', 'love', 'that', 'monica', 'and', 'chandler', 'share', 'it', 'be', 'a', 'love', 'base', 'on', 'give', 'and', 'receive', 'as', 'well', 'as', 'have', 'and', 'share', 'and', 'the', 'love', 'that', 'they', 'give', 'and', 'have', 'be', 'share', 'and', 'receive', 'and', 'through', 'this', 'have', 'and', 'give', 'and', 'share', 'and', 'recei

KeyError: 'speech'