# 1 пункт

In [3]:
from sklearn.datasets import fetch_20newsgroups

In [4]:
category_map = {"misc.forsale": 'Sales', "rec.motorcycles": 'Motorcycles',
                "rec.sport.baseball": 'Baseball', "sci.crypt": 'Cryptography',
                "sci.space": 'Space'}
categorys = category_map.keys()
categorys

dict_keys(['misc.forsale', 'rec.motorcycles', 'rec.sport.baseball', 'sci.crypt', 'sci.space'])

In [7]:
training_data = fetch_20newsgroups(subset='train', categories=category_map.keys(), shuffle=True, random_state=7)

In [9]:
training_data.data

["From: demers@cs.ucsd.edu (David DeMers)\nSubject: Re: Montreal Question.......\nOrganization: CSE Dept., UC San Diego\nLines: 13\nNntp-Posting-Host: mbongo.ucsd.edu\n\n\nIn article <1993Apr19.015442.15723@oz.plymouth.edu>, k_mullin@oz.plymouth.edu (Mully) writes:\n|> \n|>    What position does Mike Lansing play?  I cannot seem to find it \n|>  anywhere.  Thanks!!!!1\n\nHe's a shortstop by training, but he's been at second (mostly) and third\nthis year for the Expos.\n-- \nDave DeMers\t\t\t \t        demers@cs.ucsd.edu\nComputer Science & Engineering\t0114\t\tdemers%cs@ucsd.bitnet\nUC San Diego\t\t\t\t\t...!ucsd!cs!demers\nLa Jolla, CA 92093-0114\t(619) 534-0688, or -8187, FAX: (619) 534-7029\n",
 'From: bclarke@galaxy.gov.bc.ca\nSubject: Re: First Bike??\nOrganization: BC Systems Corporation\nLines: 8\n\nIn article <0forqFa00iUzMATnMz@andrew.cmu.edu>, James Leo Belliveau <jbc9+@andrew.cmu.edu> writes:\n>     I am a serious motorcycle enthusiast without a motorcycle, and to\n> put it 

In [11]:
len(training_data.data)

2968

In [13]:
training_data.target_names

['misc.forsale',
 'rec.motorcycles',
 'rec.sport.baseball',
 'sci.crypt',
 'sci.space']

In [15]:
training_data.data[0], training_data.target[0]

("From: demers@cs.ucsd.edu (David DeMers)\nSubject: Re: Montreal Question.......\nOrganization: CSE Dept., UC San Diego\nLines: 13\nNntp-Posting-Host: mbongo.ucsd.edu\n\n\nIn article <1993Apr19.015442.15723@oz.plymouth.edu>, k_mullin@oz.plymouth.edu (Mully) writes:\n|> \n|>    What position does Mike Lansing play?  I cannot seem to find it \n|>  anywhere.  Thanks!!!!1\n\nHe's a shortstop by training, but he's been at second (mostly) and third\nthis year for the Expos.\n-- \nDave DeMers\t\t\t \t        demers@cs.ucsd.edu\nComputer Science & Engineering\t0114\t\tdemers%cs@ucsd.bitnet\nUC San Diego\t\t\t\t\t...!ucsd!cs!demers\nLa Jolla, CA 92093-0114\t(619) 534-0688, or -8187, FAX: (619) 534-7029\n",
 2)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_train_termcounts = vectorizer.fit_transform(training_data.data)
print("\nDimensions of training data:", X_train_termcounts.shape)


Dimensions of training data: (2968, 40605)


In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
input_data = [
    "The curveballs of right handed pitchers tend to curve to the left",
    "Caesar cipher is an ancient form of encryption",
    "This two-wheeler is really good on slippery roads"
]

In [21]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_termcounts)

In [23]:
classifier = MultinomialNB().fit(X_train_tfidf, training_data.target)

In [25]:
X_input_termcounts = vectorizer.transform(input_data)
X_input_tfidf = tfidf_transformer.transform(X_input_termcounts)

In [27]:
predicted_categories = classifier.predict(X_input_tfidf)

In [29]:
for sentence, category in zip(input_data, predicted_categories):
  print('\nInput:', sentence, '\nPredicted category:', \
        category_map[training_data.target_names[category]])


Input: The curveballs of right handed pitchers tend to curve to the left 
Predicted category: Baseball

Input: Caesar cipher is an ancient form of encryption 
Predicted category: Cryptography

Input: This two-wheeler is really good on slippery roads 
Predicted category: Motorcycles


In [31]:
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [32]:
X_train = training_data.data[:2000]
Y_train = training_data.target[:2000]
X_test = training_data.data[2000:]
Y_test = training_data.target[2000:]

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
sgd_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('sgd_clf', SGDClassifier(random_state=42))])
knb_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knb_clf', KNeighborsClassifier(n_neighbors=10))])
sgd_ppl_clf.fit(X_train, Y_train)
knb_ppl_clf.fit(X_train, Y_train)

In [39]:
predicted_sgd = sgd_ppl_clf.predict(X_test)
print(metrics.classification_report(predicted_sgd, Y_test))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       181
           1       0.98      0.97      0.98       188
           2       0.97      0.97      0.97       211
           3       0.97      0.98      0.97       204
           4       0.97      0.98      0.98       184

    accuracy                           0.97       968
   macro avg       0.97      0.97      0.97       968
weighted avg       0.97      0.97      0.97       968



In [41]:
predict_sgd = knb_ppl_clf.predict(X_test)
print(metrics.classification_report(predicted_sgd, Y_test))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       181
           1       0.98      0.97      0.98       188
           2       0.97      0.97      0.97       211
           3       0.97      0.98      0.97       204
           4       0.97      0.98      0.98       184

    accuracy                           0.97       968
   macro avg       0.97      0.97      0.97       968
weighted avg       0.97      0.97      0.97       968



In [45]:
parameters = {
    'sgd_clf__loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron'],
    'sgd_clf__class_weight': [None, 'balanced'],
    'sgd_clf__penalty': [None, 'l2', 'l1', 'elasticnet'],
    'tfidf__strip_accents': ['ascii', 'unicode', None],
    'tfidf__ngram_range': [(1, 2), (2, 3), (3, 4)]
}

model = GridSearchCV(sgd_ppl_clf,  parameters, cv = 4, n_jobs = -1).fit(X_train, Y_train)
print('Best score and parameters combination:')
print(model.best_score_, model.best_params_)

Best score and parameters combination:
0.9714999999999999 {'sgd_clf__class_weight': None, 'sgd_clf__loss': 'hinge', 'sgd_clf__penalty': 'elasticnet', 'tfidf__ngram_range': (1, 2), 'tfidf__strip_accents': 'ascii'}


In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
sgd_ppl_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,2))),
                        ('sgd_clf', SGDClassifier(penalty='elasticnet', class_weight='balanced', random_state=42))])
sgd_ppl_clf.fit(X_train, Y_train)
predicted_sgd = sgd_ppl_clf.predict(X_test)
print(metrics.classification_report(predicted_sgd, Y_test))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94       182
           1       0.98      0.96      0.97       191
           2       0.97      0.97      0.97       210
           3       0.97      0.98      0.98       203
           4       0.97      1.00      0.99       182

    accuracy                           0.97       968
   macro avg       0.97      0.97      0.97       968
weighted avg       0.97      0.97      0.97       968



In [49]:
from sklearn.svm import SVC

svm_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm_clf', SVC())
])

In [51]:
svm_ppl_clf.fit(X_train, Y_train)

In [53]:
predicted_svm = svm_ppl_clf.predict(X_test)

In [55]:
print(metrics.classification_report(predicted_svm, Y_test))

              precision    recall  f1-score   support

           0       0.96      0.85      0.90       200
           1       0.97      0.96      0.97       190
           2       0.95      0.98      0.96       204
           3       0.92      0.99      0.95       191
           4       0.95      0.97      0.96       183

    accuracy                           0.95       968
   macro avg       0.95      0.95      0.95       968
weighted avg       0.95      0.95      0.95       968



In [57]:
def print_predictions(model_name, predicted_categories):
    print(f"\n{model_name} Predictions:")
    for sentence, category in zip(input_data, predicted_categories):
        print('\nInput:', sentence, '\nPredicted category:', category_map[training_data.target_names[category]])

print_predictions("SGD", predicted_sgd)
print_predictions("SVM", predicted_svm)


SGD Predictions:

Input: The curveballs of right handed pitchers tend to curve to the left 
Predicted category: Cryptography

Input: Caesar cipher is an ancient form of encryption 
Predicted category: Cryptography

Input: This two-wheeler is really good on slippery roads 
Predicted category: Space

SVM Predictions:

Input: The curveballs of right handed pitchers tend to curve to the left 
Predicted category: Cryptography

Input: Caesar cipher is an ancient form of encryption 
Predicted category: Cryptography

Input: This two-wheeler is really good on slippery roads 
Predicted category: Space


Тестирование моделей на своих примерах:

In [60]:
input_data = [
  "ESA announces collaboration with SpaceX for joint mission to study Martian atmosphere",
  "Researchers develop breakthrough encryption method with quantum-resistant properties",
  "New line of all-terrain bikes unveiled, promising unmatched performance in rugged landscapes",
  "Tech giants team up to offer massive markdowns on gadgets in summer blowout sale",
  "Baseball history made as rookie pitcher achieves flawless game performance"
]

In [62]:
predicted_sgd_input = sgd_ppl_clf.predict(input_data)
predicted_svm_input = svm_ppl_clf.predict(input_data)

In [64]:
def print_predictions(model_name, predicted_categories):
    print(f"\n{model_name} Predictions on Input Data:")
    for sentence, category in zip(input_data, predicted_categories):
        print('\nInput:', sentence, '\nPredicted category:', category_map[training_data.target_names[category]])

In [66]:
print_predictions("SGD", predicted_sgd_input)
print_predictions("SVM", predicted_svm_input)


SGD Predictions on Input Data:

Input: ESA announces collaboration with SpaceX for joint mission to study Martian atmosphere 
Predicted category: Sales

Input: Researchers develop breakthrough encryption method with quantum-resistant properties 
Predicted category: Cryptography

Input: New line of all-terrain bikes unveiled, promising unmatched performance in rugged landscapes 
Predicted category: Motorcycles

Input: Tech giants team up to offer massive markdowns on gadgets in summer blowout sale 
Predicted category: Sales

Input: Baseball history made as rookie pitcher achieves flawless game performance 
Predicted category: Baseball

SVM Predictions on Input Data:

Input: ESA announces collaboration with SpaceX for joint mission to study Martian atmosphere 
Predicted category: Sales

Input: Researchers develop breakthrough encryption method with quantum-resistant properties 
Predicted category: Sales

Input: New line of all-terrain bikes unveiled, promising unmatched performance in r

# 2 пункт

In [69]:
training_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=7)
test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=7)

In [71]:
sgd_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2))),
    ('sgd_clf', SGDClassifier(penalty='elasticnet', class_weight='balanced', random_state=42))
])

In [73]:
sgd_ppl_clf.fit(training_data.data, training_data.target)

In [75]:
predicted_sgd_test = sgd_ppl_clf.predict(test_data.data)
predicted_sgd_input = sgd_ppl_clf.predict(input_data)

In [77]:
print("SGD Classification Report on Test Data:\n")
print(metrics.classification_report(test_data.target, predicted_sgd_test, target_names=test_data.target_names))

SGD Classification Report on Test Data:

                          precision    recall  f1-score   support

             alt.atheism       0.75      0.75      0.75       319
           comp.graphics       0.77      0.72      0.74       389
 comp.os.ms-windows.misc       0.72      0.79      0.76       394
comp.sys.ibm.pc.hardware       0.76      0.70      0.73       392
   comp.sys.mac.hardware       0.80      0.84      0.82       385
          comp.windows.x       0.87      0.75      0.81       395
            misc.forsale       0.85      0.90      0.88       390
               rec.autos       0.90      0.87      0.88       396
         rec.motorcycles       0.93      0.95      0.94       398
      rec.sport.baseball       0.89      0.93      0.91       397
        rec.sport.hockey       0.92      0.97      0.94       399
               sci.crypt       0.89      0.95      0.92       396
         sci.electronics       0.80      0.69      0.74       393
                 sci.med       0.9

# 3 пункт

In [80]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

num_words = 20000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=num_words)

In [82]:
print(len(X_train))
print(len(X_test))
print(len(Y_train))
print(len(Y_test))

25000
25000
2000
968


In [84]:
word_index = imdb.get_word_index()
index_to_word = {index + 3: word for word, index in word_index.items()}
index_to_word[0], index_to_word[1], index_to_word[2] = '<PAD>', '<START>', '<UNK>'

In [86]:
new_reviews = [
    "This film is a real work of art! The plot captures from the first minutes, and the acting is simply amazing with its sincerity and emotionality.",
    "Unfortunately, this film left me indifferent. The flat plot and unconvincing characters make it oblivious immediately after watching.",
    "A movie that is definitely worth watching for everyone! An exciting plot, amazing acting and visual splendor make it a real masterpiece of cinematography.",
    "This film is a real find for connoisseurs of intrigue and non-standard plots. It is filled with unpredictable twists that keep the viewer in suspense until the very end.",
    "A film that makes you think about the deep philosophical questions of life. He takes the viewer into an amazing world where every scene is imbued with meaning and emotion.",
    "This movie is an ideal choice for an evening viewing in the company of friends. He will give you a lot of positive emotions, make you laugh to tears and leave the warmest memories.",
    "A film that disappointed expectations. A flat plot, ridiculous dialogues and an incredibly predictable turn of events make it a waste of time and money.",
    "An incredibly touching film that makes you think about the importance of family and friendship. It is filled with emotional moments that stay with you for a long time after watching.",
    "This film is a true discovery for fans of art cinema. It impresses with its beauty and depth of thought, transporting the viewer into an amazing world of fantasy and imagination.",
    "A film that caused controversy among critics and viewers. Some see it as a work of genius, while others consider it meaningless and boring."
]

new_true_sentiment = [1,0,1,1,1,0,1,1,1]

In [88]:
def decode_review(encoded_review):
    return ' '.join([index_to_word.get(i, '?') for i in encoded_review])

In [90]:
X_train_text = [decode_review(review) for review in X_train]
X_test_text = [decode_review(review) for review in X_test]

In [96]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=20000, ngram_range=(1, 2))),
    ('sgd', SGDClassifier(loss='log_loss', random_state=42))
])

In [98]:
pipeline.fit(X_train_text, y_train)

In [100]:
y_pred = pipeline.predict(X_test_text)
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88     12500
           1       0.88      0.89      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [102]:
new_predictions = pipeline.predict(new_reviews)
for review, sentiment,true_sentiment in zip(new_reviews, new_predictions, new_true_sentiment):
    print(f"\nReview: {review}\nPredicted Sentiment: {'Positive' if sentiment == 1 else 'Negative'} \nTrue Sentiment: {'Positive' if true_sentiment == 1 else 'Negative'}")


Review: This film is a real work of art! The plot captures from the first minutes, and the acting is simply amazing with its sincerity and emotionality.
Predicted Sentiment: Positive 
True Sentiment: Positive

Review: Unfortunately, this film left me indifferent. The flat plot and unconvincing characters make it oblivious immediately after watching.
Predicted Sentiment: Negative 
True Sentiment: Negative

Review: A movie that is definitely worth watching for everyone! An exciting plot, amazing acting and visual splendor make it a real masterpiece of cinematography.
Predicted Sentiment: Positive 
True Sentiment: Positive

Review: This film is a real find for connoisseurs of intrigue and non-standard plots. It is filled with unpredictable twists that keep the viewer in suspense until the very end.
Predicted Sentiment: Positive 
True Sentiment: Positive

Review: A film that makes you think about the deep philosophical questions of life. He takes the viewer into an amazing world where eve