In [104]:
import pandas as pd
import numpy as np
from html import unescape
import re
import fasttext

In [105]:
df = pd.read_csv('TestX.csv', sep=';')
train_df = pd.read_csv('Train.csv', sep=';')
df[df.duplicated(subset=['opinion'])]

Unnamed: 0,name,condition,opinion
503,Tamiflu,Influenza,"""Works awesome, put me back on my feet in 2 da..."
543,Epiduo,Acne,"""This creme has done absolutely nothing but ma..."
767,Phentermine / topiramate,Weight Loss,"""Started taking this when I found out I inheri..."
865,Belviq,Weight Loss,"""Started this less than week ago. Only side e..."
933,Miconazole,Vaginal Yeast Infection,"""The pain from this medication is so much wors..."
...,...,...,...
49982,LoSeasonique,Birth Control,"""I started taking LoSeasonique about 3 weeks a..."
49983,Metamucil,Constipation,"""I have suffered with constipation and ibs for..."
49986,Bydureon,"Diabetes, Type 2","""I started 10 weeks ago and saw an immediate d..."
49997,Yasmin,Acne,"""I am 20 and have had terrible skin since I wa..."


In [133]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^a-z 0-9]')
UTF_CODES = re.compile('#\d+')

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = UTF_CODES.sub('', text)
    # We do not remove stop words for LSTM because having a 'not' in a sentence is actually kind of useful
    #text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

def clean_data(df):
    df = df.copy()
    df.update(df[df['opinion'].str.contains('&')]['opinion'].apply(unescape))
    df['opinion'] = df['opinion'].str[1:-1]
    df = df[df['opinion'].str.len() > 2]
    df['opinion'] = df['opinion'].map(clean_text)
    
    if 'rate' in df.columns:
        df2 = df.drop_duplicates(subset=['opinion', 'rate'], keep='first')
        repeated_opinion_different_rate = df2[df2.duplicated(subset=['opinion'])]
    
        mean_rate = df[df['opinion'].isin(list(repeated_opinion_different_rate['opinion']))].groupby(['opinion']).median()
        for i, opinion in enumerate(mean_rate.index):

            avg_rating = np.round(mean_rate.loc[opinion]['rate']).astype(int)
            if avg_rating >= 8:
                avg_rate1 = 'high'
            elif avg_rating <= 3:
                avg_rate1 = 'low'
            else:
                avg_rate1 = 'medium'

            indexes_to_change = list((df[df['opinion'] == opinion]['rate']).index)
            df.at[indexes_to_change, 'rate'] = avg_rating
            df.at[indexes_to_change, 'rate1'] = avg_rate1
        df = df.drop_duplicates(subset=['opinion', 'rate'], keep='first')
    return df

In [92]:
#creates Vowpal Wabbit-formatted file from tsv file
def to_fasttext(df, location_output_file, test=False):
    with open(location_output_file, "w") as outfile:
    #create a reader to read train file
        #for every line
        for i, row in df.iterrows():
            #if test set label doesnt matter/or isnt available
            label = str(row['rate'])
            imb = int(imbalance[row['rate']])
            phrase = row['opinion']
            outfile.write('__label__'+label+' '+phrase + "\n")

In [None]:
# Put model here
train, test = train_test_split(df, test_size=0.1)
train = clean_data(train)
to_fasttext(train, 'abc.ft')
model = fasttext.train_supervised(input='abc.ft', lr=1.0, epoch=2, wordNgrams=5, ws=7, dim=300)

In [183]:
def get_results(model, train_df, test_df):
    train_df = train_df.copy()
    test_df = test_df.copy()
    
    ret = np.zeros((len(test_df),)).astype(int)
    pred_by_dict = test_df[test_df['opinion'].isin(train_df_opinions)]
    pred_by_model = test_df[~test_df['opinion'].isin(train_df_opinions)]
    
    print(f'{len(pred_by_dict)} elements in dataset predicted by dict')
    for i, opinion in pred_by_dict['opinion'].iteritems():
        print(f'\r{i}', end='')
        current_val = train_df[train_df['opinion'] == opinion]['rate'].median()
        ret[i] = current_val
        
    # Once we have dict predictions we can clean data
    pred_by_model = clean_data(pred_by_model)
    model_results = model.predict(list(pred_by_model['opinion']))
    for (index, result) in zip(list(pred_by_model.index), model_results[0]):
        prediction = result[0].split('__label__')[1]
        ret[index] = prediction
    df_result = pd.DataFrame(ret)
    df_result.columns = ['prediction']
    return df_result
    
r = get_results(model, train_df, df.iloc[0:100])
r

56 elements in dataset predicted by dict
99

Unnamed: 0,prediction
0,10
1,8
2,10
3,10
4,10
...,...
95,9
96,7
97,10
98,5


In [184]:
r.to_csv('results.csv')