#### imports and custom functions

In [1]:
import numpy as np
import pandas as pd

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

from imblearn.over_sampling import SMOTE, SVMSMOTE, KMeansSMOTE
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import PolynomialFeatures, MaxAbsScaler, StandardScaler
from imblearn.pipeline import Pipeline
# from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectKBest, SelectPercentile, SelectFromModel, chi2, f_classif
from sklearn.metrics import accuracy_score, f1_score
from sklearn.decomposition import SparsePCA, TruncatedSVD

from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBRFClassifier

import glob
import difflib
from thefuzz import process
import pysrt
import re
PATH = './'

[nltk_data] Downloading package punkt to /home/gin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/gin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/gin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
HTML = r'<.*?>'
TAG = r'{.*?}'
COMMENTS = r'[\(\[][A-Z ]+[\)\]]'
LETTERS = r'[^a-zA-Z\'.,!? ]'
SPACES = r'([ ])\1+'
DOTS = r'[\.]+'

def clean_subs(subs):
    txt = re.sub(HTML, ' ', subs) #html тэги меняем на пробел
    txt = re.sub(TAG, ' ', txt) #тэги меняем на пробел
    txt = re.sub(COMMENTS, ' ', txt) #комменты меняем на пробел
    txt = re.sub(LETTERS, ' ', txt) #все что не буквы меняем на пробел
    txt = re.sub(SPACES, r'\1', txt) #повторяющиеся пробелы меняем на один пробел
    txt = re.sub(DOTS, r'.', txt)  #многоточие меняем на точку
    txt = txt.encode('ascii', 'ignore').decode() #удаляем все что не ascii символы   
    txt = ".".join(txt.lower().split('.')[1:-1]) #удаляем первый и последний субтитр (обычно это реклама)
    return txt

In [3]:
# https://github.com/ronakvijay/IMDB_Sentiment_Analysis/
stop_words = stopwords.words('english') # defining stop_words
lemmatizer = WordNetLemmatizer()

def data_preprocessing(review):
    # data cleaning
    # review = re.sub(re.compile('<.*?>'), '', review) #removing html tags
    # review =  re.sub('[^A-Za-z0-9]+', ' ', review) #taking only words
    # lowercase
    review = review.lower()
    # tokenization
    tokens = nltk.word_tokenize(review) # converts review to tokens
    # stop_words removal
    review = [word for word in tokens if word not in stop_words] #removing stop words
    # lemmatization
    review = [lemmatizer.lemmatize(word) for word in review]
    # join words in preprocessed review
    review = ' '.join(review)
    return review

In [4]:
def sub_connect(df, path=PATH, cutoff=90):
    subpath = path + 'subs/'
    files = glob.glob(subpath+'*.srt') # list of all paths to subtitle files
    file_names = list(map(lambda x: x.split('/')[-1], files)) # list of all subtitle file names
    df['sub_file'] = df['Movie'].apply(lambda x: process.extractOne(x, file_names, score_cutoff=cutoff)) # levanshtein choices
    df = df.dropna(axis=0) # drop rows without found subs
    # df['path'] = df['path'].apply(lambda x: x[0])
    
    df.loc[:, 'sub'] = df.loc[:, 'sub_file'].apply(lambda x: (pysrt.open(subpath+x[0], encoding='iso-8859-1')).text)
    return df

In [5]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)

#### preprocessing

In [6]:
df = pd.read_excel('movies_labels.xlsx', index_col='id')

In [7]:
df.Level.value_counts()

B2            101
B1             55
C1             40
A2/A2+         26
B1, B2          8
A2              6
A2/A2+, B1      5
Name: Level, dtype: int64

In [8]:
# Here’s what you’re able to understand at each of these levels:

#     A1 – I am able to recognize that someone is speaking this language, e.g. I recognize single words or phrases.
#     A2 – I recognize a lot of words and phrases, but it’s hard to understand what is being said; I often only understand a phrase a few seconds after hearing it.
#     B1 – I can understand more or less what is being said, but I still miss a lot.
#     B2 – I can understand 90% of what is being said, but it’s hard for me to understand some actors, especially when they speak very quickly or with a specific accent, even though they say words that I know.
#     C1 – I am able to watch a movie freely, but I miss some words and expressions. When I watch a comedy, not everything makes me laugh, because I do not understand many cultural references.
#     C2 – I can understand virtually everything including cultural references and hidden meanings.

In [9]:
# pinning intermediate labels to the higher class
df.loc[df['Level']=='A2/A2+, B1', 'Level'] = 'B1'
df.loc[df['Level']=='A2/A2+', 'Level'] = 'A2'
df.loc[df['Level']=='B1, B2', 'Level'] = 'B2'

In [10]:
# label encoding predicting class
mapping = {value:key for key, value in enumerate(df['Level'].unique())}
inv_mapping = {key:value for key, value in enumerate(df['Level'].unique())}

df['Level'].replace(to_replace=mapping, inplace=True)

In [11]:
df = sub_connect(df)
df.drop('sub_file', axis=1, inplace=True)

df.loc[:, 'sub'] = df.loc[:, 'sub'].apply(clean_subs)
df = df[df['sub'].apply(len) > 0]

df['sub'] = df['sub'].apply(data_preprocessing)
df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'sub'] = df.loc[:, 'sub_file'].apply(lambda x: (pysrt.open(subpath+x[0], encoding='iso-8859-1')).text)


In [12]:
df.iloc[0, 2][:1000]

"enjoy flick ben phone michelle , please n't hang . talk , okay ? ca n't believe left . michelle . come back . please say something . michelle , talk . look , argument . couple fight . reason leave everything behind . running away n't gon na help . michelle , please . newscaster detail . elsewhere today , power still restored many city southern seaboard wake afternoon 's widespread blackout . inclement weather region , problem seems linked authority calling catastrophic power surge crippled traffic area . . ! damn . okay . okay , please . please . please n't hurt . please . let go , okay ? wo n't tell anybody . promise , okay ? please let go . please . man need fluid . shock . going ? 'm going keep alive . work getting handy . boyfriend expecting . 'll send cop looking . 'm sorry . one looking . 've got fight . respect . n't even think trying . 're lucky . generosity extends far . egg . toradol help pain . please . please , let go . please . nowhere go , michelle . looked wallet . give

In [13]:
X = df['sub']
y = df['Level']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y,
                                                    random_state=42, shuffle=True)

#### pipeline base

In [14]:
pipeline = Pipeline(steps=[['count_vect', CountVectorizer(lowercase=True, min_df=2, max_df=50, ngram_range=(1, 3), decode_error='ignore')],
                           ['tf_id', TfidfTransformer(use_idf=True)],
                           ['sampling', SMOTE()],
                           # ['poly', PolynomialFeatures(interaction_only=True)],
                           # ['scaler', MaxAbsScaler()],
                           # ['pca', TruncatedSVD(n_components=2048)],
                           # ['feature_selection', SelectFromModel(LogisticRegression(class_weight='balanced', n_jobs=-1), max_features=500)],
                           ['feature_selection', SelectKBest(chi2, k=5000)],
                           # ['feature_selection', SelectPercentile(chi2, percentile=70)],
                           # ['poly', PolynomialFeatures(degree=2, interaction_only=True)],
                           # ['denser', Denser()],
                           # ['scaler', StandardScaler()],
                           # ['clf', LogisticRegression(n_jobs=-1)],
                           # ['clf', PassiveAggressiveClassifier(C=1., class_weight='balanced', n_jobs=-1)],
                           # ['clf', MultinomialNB()],
                           # ['clf', XGBRFClassifier(n_estimators=500, booster='gblinear', n_jobs=-1)],                      
                           # ['clf', LGBMClassifier(class_weight='balanced', learning_rate=0.1, n_estimators=100, max_depth=7000, n_jobs=-1)],
                           # ['clf', GradientBoostingClassifier(n_estimators=100, max_depth=500, learning_rate=0.1)],
                           ['clf', AdaBoostClassifier(base_estimator=LogisticRegression(n_jobs=-1), learning_rate=0.01, n_estimators=10)],
                           # ['clf', AdaBoostClassifier(base_estimator=MultinomialNB(), learning_rate=0.1, n_estimators=10)],
                           # ['clf', SVC(kernel='rbf', C=1)],
                           # ['clf', RandomForestClassifier(n_jobs=-1)]
                           # ['clf', BaggingClassifier(LogisticRegression(class_weight='balanced', n_jobs=-1))]
                            ])

In [74]:
# split = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# cross_val_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=split, n_jobs=1)
# cross_val_scores.mean()

In [75]:
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
print('accuracy test score: ', accuracy_score(y_test, preds), '\n',
      'f1score test score: ', f1_score(y_test, preds, average='macro'))

accuracy test score:  0.7272727272727273 
 f1score test score:  0.7193001443001443
