# **Загрузка необходимых файлов**

In [1]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
%cd /content/drive/My Drive/Colab Notebooks/Files/

/content/drive/My Drive/Colab Notebooks/Files


In [9]:
stem_tfidf = pd.read_csv('hyperc_stem_tfidf.csv')
stem_ftxt = pd.read_csv('hyperc_stem_fasttext.csv', header=None)
stem_valence = pd.read_csv('hyperc_stem_valence.csv')

In [10]:
lem_tfidf = pd.read_csv('hyperc_lem_tfidf.csv')
lem_ftxt = pd.read_csv('hyperc_lem_fasttext.csv', header=None)
lem_valence = pd.read_csv('hyperc_lem_valence.csv')

In [11]:
stem_valence['valence'] = stem_valence['valence'].replace({'neut': 0, 'pos': 1, 'neg': -1})
lem_valence['valence'] = lem_valence['valence'].replace({'neut': 0, 'pos': 1, 'neg': -1})

In [12]:
stem_valence_list = stem_valence['valence'].tolist()
lem_valence_list = lem_valence['valence'].tolist()

In [14]:
mfcc = pd.read_csv('hyperc_audio_mfcc.csv')

In [4]:
spectrogram = pd.read_csv('hyperc_audio_spectrogram.csv')

In [15]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
mfcc_imputed = imputer.fit_transform(mfcc)

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
mfcc_scaled = scaler.fit_transform(mfcc_imputed)

In [5]:
spectrogram_scaled = scaler.fit_transform(spectrogram)

In [55]:
spectrogram_scaled = imputer.fit_transform(spectrogram_scaled)

In [73]:
pd.DataFrame(mfcc_scaled).to_csv('hyperc_mfcc.csv')
pd.DataFrame(spectrogram_scaled).to_csv('hyperc_spectrogram.csv')

# **Разделение текстовых данных и обучение моделей**

## **Разделение**

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_1 = stem_tfidf
X_1_train, X_1_test, y_stem_train, y_stem_test = train_test_split(X_1, stem_valence_list, test_size=0.3, random_state=28)

X_2 = stem_ftxt
X_2_train, X_2_test, y_stem_train, y_stem_test = train_test_split(X_2, stem_valence_list, test_size=0.3, random_state=28)

In [19]:
X_3 = lem_tfidf
X_3_train, X_3_test, y_lem_train, y_lem_test = train_test_split(X_3, lem_valence_list, test_size=0.3, random_state=28)

X_4 = lem_ftxt
X_4_train, X_4_test, y_lem_train, y_lem_test = train_test_split(X_4, lem_valence_list, test_size=0.3, random_state=28)

## **Обучение**

### **Random Forest**

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [21]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=28)
rf_clf.fit(X_1_train, y_stem_train)

y_1_pred_rf = rf_clf.predict(X_1_test)

print(classification_report(y_stem_test, y_1_pred_rf))

              precision    recall  f1-score   support

          -1       1.00      0.18      0.31        11
           0       0.90      1.00      0.94       223
           1       0.00      0.00      0.00        17

    accuracy                           0.89       251
   macro avg       0.63      0.39      0.42       251
weighted avg       0.84      0.89      0.85       251



In [22]:
f1_text_exp1_1 = f1_score(y_stem_test, y_1_pred_rf, average='weighted')

In [23]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=28)
rf_clf.fit(X_2_train, y_stem_train)

y_2_pred_rf = rf_clf.predict(X_2_test)

print(classification_report(y_stem_test, y_2_pred_rf))

              precision    recall  f1-score   support

          -1       1.00      0.09      0.17        11
           0       0.89      1.00      0.94       223
           1       0.00      0.00      0.00        17

    accuracy                           0.89       251
   macro avg       0.63      0.36      0.37       251
weighted avg       0.84      0.89      0.85       251



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
f1_text_exp1_2 = f1_score(y_stem_test, y_2_pred_rf, average='weighted')

In [25]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=28)
rf_clf.fit(X_3_train, y_lem_train)

y_3_pred_rf = rf_clf.predict(X_3_test)

print(classification_report(y_lem_test, y_3_pred_rf, zero_division=1))

              precision    recall  f1-score   support

          -1       1.00      0.00      0.00         8
           0       0.89      0.99      0.93       222
           1       0.25      0.05      0.08        21

    accuracy                           0.88       251
   macro avg       0.71      0.34      0.34       251
weighted avg       0.84      0.88      0.83       251



In [26]:
f1_text_exp1_3 = f1_score(y_lem_test, y_3_pred_rf, average='weighted')

In [27]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=28)
rf_clf.fit(X_4_train, y_lem_train)

y_4_pred_rf = rf_clf.predict(X_4_test)

print(classification_report(y_lem_test, y_4_pred_rf, zero_division=1))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.89      0.99      0.94       222
           1       0.67      0.10      0.17        21

    accuracy                           0.88       251
   macro avg       0.52      0.36      0.37       251
weighted avg       0.84      0.88      0.84       251



In [28]:
f1_text_exp1_4 = f1_score(y_lem_test, y_4_pred_rf, average='weighted')

### **Stochastic Gradient Descent**

In [29]:
from sklearn.linear_model import SGDClassifier

In [30]:
SGD_clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=1000)
SGD_clf.fit(X_1_train, y_stem_train)

y_1_pred_SGD = SGD_clf.predict(X_1_test)

print(classification_report(y_stem_test, y_1_pred_SGD))

              precision    recall  f1-score   support

          -1       0.67      0.18      0.29        11
           0       0.90      0.98      0.94       223
           1       0.20      0.06      0.09        17

    accuracy                           0.88       251
   macro avg       0.59      0.41      0.44       251
weighted avg       0.84      0.88      0.85       251



In [31]:
f1_text_exp2_1 = f1_score(y_stem_test, y_1_pred_SGD, average='weighted')

In [32]:
SGD_clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=1000)
SGD_clf.fit(X_2_train, y_stem_train)

y_2_pred_SGD = SGD_clf.predict(X_2_test)

print(classification_report(y_stem_test, y_2_pred_SGD))

              precision    recall  f1-score   support

          -1       0.40      0.18      0.25        11
           0       0.89      0.97      0.93       223
           1       0.00      0.00      0.00        17

    accuracy                           0.87       251
   macro avg       0.43      0.38      0.39       251
weighted avg       0.81      0.87      0.84       251



In [33]:
f1_text_exp2_2 = f1_score(y_stem_test, y_2_pred_SGD, average='weighted')

In [34]:
SGD_clf = SGDClassifier(loss="hinge", penalty="elasticnet", max_iter=15000)
SGD_clf.fit(X_3_train, y_lem_train)

y_3_pred_SGD = SGD_clf.predict(X_3_test)

print(classification_report(y_lem_test, y_3_pred_SGD))

              precision    recall  f1-score   support

          -1       0.67      0.25      0.36         8
           0       0.90      0.97      0.94       222
           1       0.44      0.19      0.27        21

    accuracy                           0.88       251
   macro avg       0.67      0.47      0.52       251
weighted avg       0.86      0.88      0.86       251



In [35]:
f1_text_exp2_3 = f1_score(y_lem_test, y_3_pred_SGD, average='weighted')

In [36]:
SGD_clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=1000)
SGD_clf.fit(X_4_train, y_lem_train)

y_4_pred_SGD = SGD_clf.predict(X_4_test)

print(classification_report(y_lem_test, y_4_pred_SGD))

              precision    recall  f1-score   support

          -1       0.15      0.25      0.19         8
           0       0.90      0.89      0.90       222
           1       0.15      0.14      0.15        21

    accuracy                           0.80       251
   macro avg       0.40      0.43      0.41       251
weighted avg       0.82      0.80      0.81       251



In [37]:
f1_text_exp2_4 = f1_score(y_lem_test, y_4_pred_SGD, average='weighted')

### **SVM**

In [38]:
from sklearn.svm import SVC

In [39]:
svm = SVC(kernel='rbf')
svm.fit(X_1_train, y_stem_train)

y_1_pred_svm = svm.predict(X_1_test)

print(classification_report(y_stem_test, y_1_pred_svm))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        11
           0       0.89      1.00      0.94       223
           1       0.00      0.00      0.00        17

    accuracy                           0.89       251
   macro avg       0.30      0.33      0.31       251
weighted avg       0.79      0.89      0.84       251



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
f1_text_exp3_1 = f1_score(y_stem_test, y_1_pred_svm, average='weighted')

In [41]:
svm = SVC(kernel='rbf')
svm.fit(X_2_train, y_stem_train)

y_2_pred_svm = svm.predict(X_2_test)

print(classification_report(y_stem_test, y_2_pred_svm))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        11
           0       0.89      1.00      0.94       223
           1       0.00      0.00      0.00        17

    accuracy                           0.89       251
   macro avg       0.30      0.33      0.31       251
weighted avg       0.79      0.89      0.84       251



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
f1_text_exp3_2 = f1_score(y_stem_test, y_2_pred_svm, average='weighted')

In [43]:
svm = SVC(kernel='rbf')
svm.fit(X_3_train, y_lem_train)

y_3_pred_svm = svm.predict(X_3_test)

print(classification_report(y_lem_test, y_3_pred_svm))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.88      1.00      0.94       222
           1       0.00      0.00      0.00        21

    accuracy                           0.88       251
   macro avg       0.29      0.33      0.31       251
weighted avg       0.78      0.88      0.83       251



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
f1_text_exp3_3 = f1_score(y_lem_test, y_3_pred_svm, average='weighted')

In [45]:
svm = SVC(kernel='rbf')
svm.fit(X_4_train, y_lem_train)

y_4_pred_svm = svm.predict(X_4_test)

print(classification_report(y_lem_test, y_4_pred_svm))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.88      1.00      0.94       222
           1       0.00      0.00      0.00        21

    accuracy                           0.88       251
   macro avg       0.29      0.33      0.31       251
weighted avg       0.78      0.88      0.83       251



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
f1_text_exp3_4 = f1_score(y_lem_test, y_4_pred_svm, average='weighted')

## **Результаты текстовой классификации**

In [47]:
preprocessing_methods = [
    'Стемминг, TF-IDF',
    'Стемминг, Fasttext',
    'Лемматизация, TF-IDF',
    'Лемматизация, Fasttext',
    'Стемминг, TF-IDF',
    'Стемминг, Fasttext',
    'Лемматизация, TF-IDF',
    'Лемматизация, Fasttext',
    'Стемминг, TF-IDF',
    'Стемминг, Fasttext',
    'Лемматизация, TF-IDF',
    'Лемматизация, Fasttext'
]

ml_methods = [
    'RF',
    'RF',
    'RF',
    'RF',
    'SGD',
    'SGD',
    'SGD',
    'SGD',
    'SVM',
    'SVM',
    'SVM',
    'SVM'
]

results = [
    f1_text_exp1_1,
    f1_text_exp1_2,
    f1_text_exp1_3,
    f1_text_exp1_4,
    f1_text_exp2_1,
    f1_text_exp2_2,
    f1_text_exp2_3,
    f1_text_exp2_4,
    f1_text_exp3_1,
    f1_text_exp3_2,
    f1_text_exp3_3,
    f1_text_exp3_4
]

In [48]:
text_f1 = pd.DataFrame({
    'Методы предобработки': preprocessing_methods,
    'Метод МО': ml_methods,
    'F1 score': results
})

In [49]:
text_f1

Unnamed: 0,Методы предобработки,Метод МО,F1 score
0,"Стемминг, TF-IDF",RF,0.851001
1,"Стемминг, Fasttext",RF,0.845036
2,"Лемматизация, TF-IDF",RF,0.832694
3,"Лемматизация, Fasttext",RF,0.843717
4,"Стемминг, TF-IDF",SGD,0.849929
5,"Стемминг, Fasttext",SGD,0.836351
6,"Лемматизация, TF-IDF",SGD,0.862724
7,"Лемматизация, Fasttext",SGD,0.81031
8,"Стемминг, TF-IDF",SVM,0.835964
9,"Стемминг, Fasttext",SVM,0.835964


# **Разделение аудиопризнков и обучение моделей**

## **Разделение**

In [7]:
from sklearn.model_selection import train_test_split

In [50]:
X_mfcc = mfcc_scaled
X_mfcc_train, X_mfcc_test, y_mfcc_train, y_mfcc_test = train_test_split(X_mfcc, lem_valence_list, test_size=0.3, random_state=28)

In [56]:
X_spectogram = spectrogram_scaled
X_spectogram_train, X_spectogram_test, y_spectogram_train, y_spectogram_test = train_test_split(X_spectogram, lem_valence_list, test_size=0.3, random_state=28)

## **Обучение**

### **RF**

In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [52]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=28)
rf_clf.fit(X_mfcc_train, y_mfcc_train)

y_pred_rf_mfcc = rf_clf.predict(X_mfcc_test)

print(classification_report(y_mfcc_test, y_pred_rf_mfcc))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.88      1.00      0.94       222
           1       0.00      0.00      0.00        21

    accuracy                           0.88       251
   macro avg       0.29      0.33      0.31       251
weighted avg       0.78      0.88      0.83       251



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [53]:
f1_audio_exp1_1 = f1_score(y_mfcc_test, y_pred_rf_mfcc, average='weighted')

In [57]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=28)
rf_clf.fit(X_spectogram_train, y_spectogram_train)

y_pred_rf_spectogram = rf_clf.predict(X_spectogram_test)

print(classification_report(y_spectogram_test, y_pred_rf_spectogram))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.88      1.00      0.94       222
           1       0.00      0.00      0.00        21

    accuracy                           0.88       251
   macro avg       0.29      0.33      0.31       251
weighted avg       0.78      0.88      0.83       251



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [58]:
f1_audio_exp1_2 = f1_score(y_spectogram_test, y_pred_rf_spectogram, average='weighted')

### **SGD**

In [59]:
from sklearn.linear_model import SGDClassifier

In [60]:
SGD_clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=1000)
SGD_clf.fit(X_mfcc_train, y_mfcc_train)

y_pred_SGD_mfcc = SGD_clf.predict(X_mfcc_test)

print(classification_report(y_mfcc_test, y_pred_SGD_mfcc))

              precision    recall  f1-score   support

          -1       0.07      0.50      0.12         8
           0       0.89      0.73      0.80       222
           1       0.00      0.00      0.00        21

    accuracy                           0.66       251
   macro avg       0.32      0.41      0.31       251
weighted avg       0.79      0.66      0.71       251



In [61]:
f1_audio_exp2_1 = f1_score(y_mfcc_test, y_pred_SGD_mfcc, average='weighted')

In [62]:
SGD_clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=1000)
SGD_clf.fit(X_spectogram_train, y_spectogram_train)

y_pred_SGD_spectrogram = SGD_clf.predict(X_spectogram_test)

print(classification_report(y_spectogram_test, y_pred_SGD_spectrogram))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.87      0.68      0.77       222
           1       0.05      0.10      0.07        21

    accuracy                           0.61       251
   macro avg       0.31      0.26      0.28       251
weighted avg       0.78      0.61      0.68       251



In [63]:
f1_audio_exp2_2 = f1_score(y_spectogram_test, y_pred_SGD_spectrogram, average='weighted')

### **SVM**

In [64]:
from sklearn.svm import SVC

In [65]:
svm = SVC(kernel='rbf')
svm.fit(X_mfcc_train, y_mfcc_train)

y_pred_svm_mfcc = svm.predict(X_mfcc_test)

print(classification_report(y_mfcc_test, y_pred_svm_mfcc))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.88      1.00      0.94       222
           1       0.00      0.00      0.00        21

    accuracy                           0.88       251
   macro avg       0.29      0.33      0.31       251
weighted avg       0.78      0.88      0.83       251



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [66]:
f1_audio_exp3_1 = f1_score(y_mfcc_test, y_pred_svm_mfcc, average='weighted')

In [67]:
svm = SVC(kernel='rbf')
svm.fit(X_spectogram_train, y_spectogram_train)

y_pred_svm_spectrogram = svm.predict(X_spectogram_test)

print(classification_report(y_spectogram_test, y_pred_svm_spectrogram))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.88      1.00      0.94       222
           1       0.00      0.00      0.00        21

    accuracy                           0.88       251
   macro avg       0.29      0.33      0.31       251
weighted avg       0.78      0.88      0.83       251



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [68]:
f1_audio_exp3_2 = f1_score(y_spectogram_test, y_pred_svm_spectrogram, average='weighted')

## **Результаты аудио классификации**

In [69]:
features = [
    'MFCC',
    'Spectogram',
    'MFCC',
    'Spectogram',
    'MFCC',
    'Spectogram'
]

ml_methods = [
    'RF',
    'RF',
    'SGD',
    'SGD',
    'SVM',
    'SVM'
]

results = [
    f1_audio_exp1_1,
    f1_audio_exp1_2,
    f1_audio_exp2_1,
    f1_audio_exp2_2,
    f1_audio_exp3_1,
    f1_audio_exp3_2
]

In [70]:
audio_f1 = pd.DataFrame({
    'Признаки': features,
    'Метод МО': ml_methods,
    'F1 score': results
})

In [71]:
audio_f1

Unnamed: 0,Признаки,Метод МО,F1 score
0,MFCC,RF,0.830235
1,Spectogram,RF,0.830235
2,MFCC,SGD,0.713305
3,Spectogram,SGD,0.684467
4,MFCC,SVM,0.830235
5,Spectogram,SVM,0.830235


# **Результаты**

In [74]:
text_f1

Unnamed: 0,Методы предобработки,Метод МО,F1 score
0,"Стемминг, TF-IDF",RF,0.851001
1,"Стемминг, Fasttext",RF,0.845036
2,"Лемматизация, TF-IDF",RF,0.832694
3,"Лемматизация, Fasttext",RF,0.843717
4,"Стемминг, TF-IDF",SGD,0.849929
5,"Стемминг, Fasttext",SGD,0.836351
6,"Лемматизация, TF-IDF",SGD,0.862724
7,"Лемматизация, Fasttext",SGD,0.81031
8,"Стемминг, TF-IDF",SVM,0.835964
9,"Стемминг, Fasttext",SVM,0.835964


In [75]:
audio_f1

Unnamed: 0,Признаки,Метод МО,F1 score
0,MFCC,RF,0.830235
1,Spectogram,RF,0.830235
2,MFCC,SGD,0.713305
3,Spectogram,SGD,0.684467
4,MFCC,SVM,0.830235
5,Spectogram,SVM,0.830235
