In [9]:
import requests
import pandas as pd
from datetime import datetime
import time
from datetime import timedelta
import pickle
import re
import preprocessor as p
from nltk.corpus import stopwords
import pymorphy2
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import warnings
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings("ignore")
import numpy as np
import matplotlib.pyplot as plt
import collections
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from IPython.display import Markdown as md

# Part 1

### Retrieving posts 
Posts were retrieved from the following vk groups: 
* https://vk.com/itmomagistry 
* https://vk.com/careers_service_itmo 
* https://vk.com/itmoru 
* https://vk.com/itmo_exchange 
* https://m.vk.com/scicomm).

In [217]:
def get_posts(group):

    count = 100
    offset = 0 
    posts = []
    retrieved_posts = 'yes'
    
    while retrieved_posts:

        url = 'https://api.vk.com/method/wall.get'
        params = {
            'domain': group,
            'filter': 'owner',
            'count': count,
            'offset': offset,
            'access_token': '8e84f1f18e84f1f18e84f1f1db8eeb68e388e848e84f1f1d0bb32bd2b9ea45a67630377',
            'v': 5.73
        }

        retrieved_posts = requests.get(url, params = params).json()['response']['items']

        posts += retrieved_posts 
        offset += count   
        time.sleep(0.5)
            
    return posts

In [218]:
groups = ['itmomagistry', 'careers_service_itmo', 'itmoru', 'itmo_exchange', 'scicomm']

posts = []
for group in groups:
    posts += get_posts(group=group)

In [10]:
# pickle.dump( posts, open( "posts.p", "wb" ) )
posts = pickle.load( open( "posts.p", "rb" ) )

### Splitting posts into old (older than a year ago) and new (later than a year ago) ones. 
The old ones will be used for training. The new ones will be used for testing. 

In [11]:
def split_train_test(posts):
    test = []
    train = []
    
    year_ago = (datetime.now()-timedelta(days=365)).timestamp()
    
    for post in posts:
        if post['date'] > year_ago:
            test.append(post)
        else:
            train.append(post)
            
    return test, train

In [12]:
test_posts, train_posts = split_train_test(posts)

In [13]:
md(f'We have retrieved {len(posts)} posts.<br>{len(train_posts)} from them were published before February 2019. These posts will be used for training<br>{len(test_posts)} were published after February 2019. These posts will be used for testing')

We have retrieved 13586 posts.<br>11105 from them were published before February 2019. These posts will be used for training<br>2481 were published after February 2019. These posts will be used for testing

#### Post example

In [14]:
train_posts[0]

{'id': 3764,
 'from_id': -54201931,
 'owner_id': -54201931,
 'date': 1549961199,
 'marked_as_ads': 0,
 'post_type': 'post',
 'text': '✨День российской науки.\nПрофессиональный праздник отмечается в честь основания в нашей стране Академии наук. Она была создана 295 лет назад.\n📝 Подробности читайте по ссылке: \nhttps://www.1tv.ru/news/2019-02-08/360076-segodnya_den_rossiyskoy_nauki',
 'attachments': [{'type': 'video',
   'video': {'access_key': '27143b75f5296c628d',
    'can_comment': 0,
    'can_like': 1,
    'can_repost': 1,
    'can_subscribe': 1,
    'can_add_to_faves': 1,
    'can_add': 1,
    'comments': 0,
    'date': 1549961199,
    'description': 'Профессиональный праздник отмечается в\xa0честь основания в\xa0нашей стране Академии наук. Она была создана 295 лет назад.',
    'duration': 0,
    'photo_130': 'https://sun9-59.userapi.com/c852216/v852216335/b53c1/ADxFBZJ6jwo.jpg',
    'photo_320': 'https://sun9-11.userapi.com/c852216/v852216335/b53c3/n1zeZST6cbU.jpg',
    'photo_640

# Part 2 

###  Retrieving text from posts and getting rid of duplicate posts

In [15]:
def get_post_text(post):
    if 'copy_history' in post:
        text = post['copy_history'][0]['text']
    else:
        text = post['text']
    return text.lower()

In [16]:
test_texts = {get_post_text(post) for post in test_posts}
train_texts = {get_post_text(post) for post in train_posts}

In [17]:
md(f'Found {len(train_posts) - len(train_texts)} duplicate posts from test set and {len(test_posts) - len(test_texts)} from train set.')

Found 622 duplicate posts from test set and 101 from train set.

### Filtering out posts that mention internships, scholarships, grants etc.

In [18]:
def get_texts_by_keywords(texts, words = {'стажировк', 'стажер', 'стажёр', 'scholarship', 'стипенд', 'грант', 'intern', 'обмен'}):
    return [text for text in texts if any(word in text for word in words)]

In [19]:
train_texts = get_texts_by_keywords(train_texts)

### Dumping 500 of selected posts to excel sheet in order to label them manually.

In [21]:
pd.DataFrame(zip(train_texts[:500], [0]*500)).to_excel('train.xlsx', index = False)

# Part 3

In the following section we'll perform text classification.
<br>**Input**: texts of posts mentionning internships, scholarships etc. with a corresponding class (labeled manually)
<br>**Output**: one of the following classes


#### Classes:
* 0 - other
* 1 - internships
* 3 - scholarships, grants

### Retrieving labeled data from excel sheet.

In [29]:
df = pd.read_excel('train_part2.xlsx')
X_not_processed = df[0]
y = df[1]

### Preprocessing data:
* removing urls, emojis and numbers
* removing punctuation signs
* lemmatization 
* removing stopwords

In [30]:
def preprocess(text, morph, stopwords):
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.NUMBER)
    text = p.clean(text)

    words = [morph.parse(word)[0].normal_form for word in re.findall(r'\w+', text)]
    words = [word for word in words if word not in stopwords]
    return " ".join(words)

In [31]:
morph = pymorphy2.MorphAnalyzer()
# russian_stopwords = get_stop_words('ru')
russian_stopwords = stopwords.words("russian")

#### Example
**Before preprocessing:**

In [32]:
X_not_processed[0]

'как пройти собеседование на английском\n\nсобеседование на новую работу – непростая задача. а если вы нашли идеальную для вас вакансию в международной компании, то вас ждет еще более серьезное испытание – собеседование на иностранном языке. это может вызывать неуверенность, даже если вы отличный профессионал! [club21199653|kaplan international english] подготовил советы для вас, как подготовиться к разговору с работодателем на английском языке, и легко получить работу мечты!\n\nhttp://kaplaninternational.com/rus/blog/how-to-interview-in-english/'

**After preprocessing:**

In [35]:
preprocess(X_not_processed[0], morph, russian_stopwords)

'пройти собеседование английский собеседование новый работа непростой задача найти идеальный вакансия международный компания ждать ещё серьёзный испытание собеседование иностранный язык это мочь вызывать неуверенность отличный профессионал club21199653 kaplan international english подготовить совет подготовиться разговор работодатель английский язык легко получить работа мечта'

In [37]:
X = [preprocess(text, morph, russian_stopwords) for text in X_not_processed]

### Splitting train data into training and validation sets.

In [38]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33)

### Choosing the best classifier

In [39]:
vectorizers = {'TfidfVectorizer':TfidfVectorizer(), 
               'CountVectorizer':CountVectorizer()
              }
classifiers = {'GradientBoostingClassifier':GradientBoostingClassifier(random_state = 0), 
               'RandomForestClassifier':RandomForestClassifier(random_state = 0), 
               'LinearSVC':LinearSVC(random_state = 0), 
               'MLPClassifier':MLPClassifier(random_state = 0)}

In [40]:
f1_scores = {}
for vectorizer_name, vectorizer in vectorizers.items():
    scores = {}
    X_train_vec = vectorizer.fit_transform(X_train)
    X_val_vec = vectorizer.transform(X_val)
    for classifier_name, classifier in classifiers.items():
        classifier.fit(X_train_vec, y_train)
        predictions = classifier.predict(X_val_vec)
        scores[classifier_name] = f1_score(y_val, predictions, average = 'weighted')
    f1_scores[vectorizer_name] = scores

### Classification results

In [41]:
pd.DataFrame(f1_scores)

Unnamed: 0,TfidfVectorizer,CountVectorizer
GradientBoostingClassifier,0.673871,0.664512
LinearSVC,0.775001,0.767615
MLPClassifier,0.686452,0.726754
RandomForestClassifier,0.638984,0.634742


Looks like the most promissing results are achieved by the **GradientBoostingClassifier** together with **TfidfVectorizer**. 
<br>LinearSVC together with **TfidfVectorizer** gives a slightly worse result while being much simpler than GradientBoostingClassifier. The latter looks too sophisticated for our simple problem, so let's stick to the LinearSVC model and try to improve this model's performance by tuning hyperparameters using GridSearchCV.

In [42]:
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

In [43]:
param_grid = {'C':np.arange(0.01,100,10), 'max_iter':[1, 5, 10, 50, 100]}
gs = GridSearchCV(LinearSVC(),param_grid,cv=5,return_train_score=True)
gs.fit(X_train_vec,y_train)
best_parameters = gs.best_params_

In [44]:
print(f'Best parameters: {best_parameters}')

Best parameters: {'C': 70.01, 'max_iter': 5}


In [45]:
clf = LinearSVC(**best_parameters, random_state = 0)
clf.fit(X_train_vec,y_train)
predictions = clf.predict(X_val_vec)

In [46]:
best_f1_score = np.around(f1_score(y_val, predictions, average = 'weighted'), decimals=2)

In [47]:
print(f'F1 score of the model with tuned hyperparameters: {best_f1_score}')

F1 score of the model with tuned hyperparameters: 0.78


# Part 4
In this section we'll try to detect topics of posts about internships and about scholarships.
<br>First, let's try to extract topics automatically using LDA model.

In [48]:
def lda(values, num_topics=3, num_words=5):
    values = [preprocess(text, morph, russian_stopwords).split() for text in values]
    dictionary = corpora.Dictionary(values)
    corpus = [dictionary.doc2bow(text) for text in values]
    ldamodel = LdaModel(corpus, num_topics, id2word=dictionary, passes=15)
    topics = ldamodel.print_topics(num_words)
    return topics

In [49]:
selected_posts = {'internships': [post for post, c in zip(X_not_processed, predictions) if c == 1],
                  'scholarships': [post for post, c in zip(X_not_processed, predictions) if c == 3]}

In [50]:
for type, values in selected_posts.items():
    print(type, '\n')
    topics = lda(values)
    for i, topic in enumerate(topics):
        print(i, topic)
    print()

internships 

0 (0, '0.012*"работа" + 0.012*"университет" + 0.012*"итмый" + 0.010*"свой" + 0.009*"компания" + 0.009*"собеседование" + 0.009*"самый" + 0.007*"мочь" + 0.007*"стипендия" + 0.007*"работодатель"')
1 (1, '0.009*"стажировка" + 0.009*"cup" + 0.009*"получать" + 0.009*"получить" + 0.009*"компания" + 0.009*"информация" + 0.009*"инженер" + 0.007*"чемпионат" + 0.007*"кейс" + 0.007*"поступление"')
2 (2, '0.015*"итмый" + 0.013*"студент" + 0.013*"университет" + 0.013*"конкурс" + 0.011*"участие" + 0.009*"стипендия" + 0.009*"участник" + 0.009*"научный" + 0.009*"проект" + 0.009*"школа"')

scholarships 

0 (0, '0.014*"компания" + 0.011*"свой" + 0.011*"мероприятие" + 0.011*"ведущий" + 0.011*"стажировка" + 0.009*"апрель" + 0.009*"работа" + 0.009*"деловой" + 0.009*"студент" + 0.006*"это"')
1 (1, '0.011*"конкурс" + 0.011*"участие" + 0.011*"студент" + 0.011*"проект" + 0.011*"стажировка" + 0.010*"итмый" + 0.010*"it" + 0.008*"международный" + 0.008*"научный" + 0.007*"год"')
2 (2, '0.014*"студент"

Unfortunately, there doesn't seem to be any topics that could be easily extracted by the model (even if we tune number of topics). So let's use the good old string matching to find **the most wanted skills** in the intership posts.

First, let's look at the most popular words in the posts.

In [51]:
internship_texts = [preprocess(text, morph, russian_stopwords) for text in selected_posts['internships']]
internship_words = [text.split() for text in internship_texts]
internship_words = [item for sublist in internship_words for item in sublist]
counter = collections.Counter(internship_words)
print(counter.most_common(200))

[('итмый', 17), ('университет', 14), ('студент', 11), ('компания', 11), ('стипендия', 11), ('работа', 10), ('свой', 9), ('участие', 8), ('itmo', 7), ('стажировка', 7), ('весь', 7), ('получить', 7), ('опыт', 7), ('мочь', 6), ('конкурс', 6), ('собеседование', 6), ('получать', 6), ('самый', 6), ('который', 6), ('стать', 5), ('информация', 5), ('дать', 5), ('участник', 5), ('пройти', 5), ('проект', 5), ('вопрос', 5), ('cup', 5), ('поступление', 5), ('работодатель', 5), ('международный', 4), ('направление', 4), ('инженер', 4), ('школа', 4), ('ведущий', 4), ('грант', 4), ('кейс', 4), ('чемпионат', 4), ('россия', 4), ('научный', 4), ('важный', 4), ('это', 3), ('обучающийся', 3), ('год', 3), ('принять', 3), ('дробный', 3), ('ulsee', 3), ('программа', 3), ('абитуриент', 3), ('обучение', 3), ('блокчейн', 3), ('каждый', 3), ('институт', 3), ('регистрация', 3), ('являться', 3), ('ещё', 3), ('день', 3), ('открытый', 3), ('дверь', 3), ('решение', 3), ('заявка', 3), ('право', 3), ('получение', 3), ('

### Now let's choose the words that correspond to some programmer's skills and are relatively popular in the dataset.

In [52]:
key_words = ['java', 'c', 'python', 'javascript', 'frontend', 'backend', 'linux', 'office',
             'bi', 'data science', 'анализ дать', 'машинный обучение', 'machine learning', 'аналитик', 
             'тестировщик', 'тестирование', 'английский', 'дизайн', 'разработка', 'разработчик', 'инженер','1с']

In [53]:
def get_most_popular_skills(key_words, texts):
    skills_dict = dict.fromkeys(key_words, 0)
    for text in texts:
        for word in key_words:
            if word in text:
                skills_dict[word] += 1
    skills_dict = {skill: count for skill, count in sorted(skills_dict.items(), key=lambda item: item[1], reverse=True)}
    return skills_dict

In [54]:
skills_dict = get_most_popular_skills(key_words, internship_texts)
for skill, count in skills_dict.items():
    print(f'{skill}: {count}')

c: 7
bi: 1
анализ дать: 1
машинный обучение: 1
дизайн: 1
инженер: 1
java: 0
python: 0
javascript: 0
frontend: 0
backend: 0
linux: 0
office: 0
data science: 0
machine learning: 0
аналитик: 0
тестировщик: 0
тестирование: 0
английский: 0
разработка: 0
разработчик: 0
1с: 0


# Part 5
## Test data
Now let's take a look at the test set, i.e. texts published before Ferbuary 2019.

#### First, let's use our classifier to extract texts that announce internships and scholarships. 

In [55]:
test_texts_orig = get_texts_by_keywords(test_texts)
test_texts = [preprocess(text, morph, russian_stopwords) for text in test_texts_orig]

In [56]:
test_texts_vec = vectorizer.transform(test_texts)
test_predictions = clf.predict(test_texts_vec)

#### Now let's dump texts and our predictions to a csv file.

In [57]:
pd.DataFrame(zip(test_texts_orig, test_predictions), 
             columns = ['text', 'class']).to_csv('part_3.csv', index=False)

In [58]:
test_internships = [post for post, c in zip(test_texts, test_predictions) if c == 1]

#### What are the most wanted skills in 2019-2020?

In [59]:
test_skills_dict = get_most_popular_skills(key_words, test_internships)
for skill, count in test_skills_dict.items():
    print(f'{skill}: {count}')

c: 54
английский: 16
аналитик: 12
разработка: 12
bi: 11
инженер: 11
тестирование: 10
разработчик: 10
java: 7
python: 7
frontend: 5
backend: 4
data science: 3
дизайн: 3
javascript: 2
linux: 2
анализ дать: 2
машинный обучение: 2
тестировщик: 1
office: 0
machine learning: 0
1с: 0


### What can we tell from the data?
* Most of the companies were looking for developers, most of them for c and c++ languages.
* Machine Learning, python, data science, business intelligence speciallists are much more likely to find an internship in 2020 than in the previous years.

In [60]:
def get_keywords_from_text(key_words, text):
    result = []
    for word in key_words:
        if word in text:
            result.append(word)
    return result

In [61]:
test_internships_keywords = [get_keywords_from_text(key_words, text) for text in test_internships]

#### Now let's dump internships texts and our predictions to a csv file.

In [62]:
test_internships_orig = [post for post, c in zip(test_texts_orig, test_predictions) if c == 1]
pd.DataFrame(zip(test_internships_orig, test_internships_keywords), 
             columns = ['text', 'keywords']).to_csv('part_4.csv', index=False)