In [2]:
import sys
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

In [3]:
#sys.path.append("DataAnalisysMipt\\Python\\5. DataAnalysisApplications")
data = pd.read_csv(
    "SMSSpamCollection.txt",
    "\t",
    header=0,
    names=["label", "message"])
data.head()

Unnamed: 0,label,message
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...


In [4]:
labels, texts = zip(*[(row[0],row[1]) for row in data.as_matrix()])

  """Entry point for launching an IPython kernel.


In [5]:
vectorizer = CountVectorizer().fit(texts)
y = [0 if label == "ham" else 1 for label in labels]
X = vectorizer.transform(texts).toarray()

In [6]:
logistic_classifier = LogisticRegression(random_state=2)
logistic_classifier_score = cross_val_score(logistic_classifier, X, y, cv=10, scoring="f1")



In [10]:
def save_answer_num(fname, number):
    """Функция сохранения в файл ответа, состоящего из одного числа"""
    with open(fname, "w") as fout:
        fout.write(str(number))
        

logistic_classifier_mean_score = np.round(logistic_classifier_score.mean(), 1)
save_answer_num("answer_1.txt", logistic_classifier_mean_score)
print("Mean cross val score for logistic regression classifier is %.1f"
    % logistic_classifier_mean_score)

Mean cross val score for logistic regression classifier is 0.9


In [12]:
unknown_messages = [
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB",
    "FreeMsg: Txt: claim your reward of 3 hours talk time",
    "Have you visited the last lecture on physics?",
    "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
    "Only 99$"]

In [16]:
log_classifier = LogisticRegression(random_state=2).fit(X, y)
unknown_features = vectorizer.transform(unknown_messages).toarray()
log_classifier_results = [log_classifier.predict(message) for message in unknown_features]

ValueError: Expected 2D array, got 1D array instead:
array=[0 0 0 ... 0 0 0].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [9]:
log_classifier_result_string = " ".join([str(res[0]) for res in log_classifier_results])
print("Default logistic regression results: %s" % (log_classifier_result_string))
save_answer_num("answer_2.txt", log_classifier_result_string)

NameError: name 'log_classifier_results' is not defined

In [17]:
ngram_ranges = [(2,2), (3,3), (1,3)]
def calculate_ngram_result(ngram_range, texts, vectorizer, model):
    vectorizer_ngram = vectorizer(ngram_range=ngram_range).fit(texts)
    X_ngram = vectorizer_ngram.transform(texts)
    # Во всех трех случаях измерьте получившееся в кросс-валидации значение f1-меры,
    return cross_val_score(model, X_ngram, y, cv=10, scoring="f1").mean()
ngram_results = [
    calculate_ngram_result(
        ngram_range,
        texts,
        CountVectorizer,
        LogisticRegression(random_state=2))
    for ngram_range in ngram_ranges]

In [18]:
ngram_result_string = " ".join([str(np.round(res, 2)) for res in ngram_results])
print("Ngram results are: %s" % (ngram_result_string))
save_answer_num("answer_3.txt", ngram_result_string)

Ngram results are: 0.82 0.73 0.93


In [19]:
ngram_bayes_results = [
    calculate_ngram_result(
        ngram_range,
        texts,
        CountVectorizer,
        MultinomialNB())
    for ngram_range in ngram_ranges]

In [20]:
ngram_result_bayes_string = " ".join([str(np.round(res, 2)) for res in ngram_bayes_results])
print("Ngram Bayes results are: %s" % (ngram_result_bayes_string))
save_answer_num("answer_4.txt", ngram_result_bayes_string)


Ngram Bayes results are: 0.65 0.38 0.89


In [21]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1)).fit(texts)
X_tfidf = tfidf_vectorizer.transform(texts).toarray()
logistic_classifier_tfidf = LogisticRegression(random_state=2)
logistic_classifier_score_tfidf = cross_val_score(
    logistic_classifier_tfidf,
    X_tfidf,
    y,
    cv=10,
    scoring="f1")

In [22]:
logistic_classifier_mean_score_tfidf = logistic_classifier_score_tfidf.mean()
print("Count vectorizer score: %f" % (logistic_classifier_mean_score))
print("Tfidf vectorizer score: %f" % (logistic_classifier_mean_score_tfidf))

Count vectorizer score: 0.900000
Tfidf vectorizer score: 0.852860


In [23]:
score_diff = logistic_classifier_mean_score_tfidf - logistic_classifier_mean_score
if np.abs(score_diff) < 0.01:
    answer_9 = 0
else:
    answer_9 = 1 if score_diff > 0 else -1
save_answer_num("answer_5.txt", answer_9)