In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

%matplotlib inline


# Load the track table from a CSV file located relative to the working directory.
df = pd.read_csv("../data/tracks.csv")
df.head()

# Scatter plot of popularity against track duration, labelled via the Axes
# object that seaborn returns.
plt.figure(figsize=(8, 6))
scatter_ax = sns.scatterplot(x='duration', y='popularity', data=df)
scatter_ax.set_title('Зависимость популярности от длительности трека')
scatter_ax.set_xlabel('Длительность (сек)')
scatter_ax.set_ylabel('Популярность')
plt.show()

# Pairwise correlation matrix of the two columns (pandas' default method).
print("Коэффициент корреляции:")
duration_popularity_corr = df[['duration', 'popularity']].corr()
print(duration_popularity_corr)


def _stopword_set():
    """Return the combined English and Russian NLTK stop words as a frozenset.

    The set is built on the first call and stored on the function object, so
    later calls reuse it instead of asking the corpus reader for the word
    lists again.
    """
    cached = getattr(_stopword_set, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english')) | frozenset(
            stopwords.words('russian')
        )
        _stopword_set._cache = cached
    return cached


def clean_text(text):
    """Normalize *text* and return its tokens with stop words removed.

    The text is lower-cased, every character other than ``a-z``, ``а-я``,
    ``ё`` and whitespace is deleted, and the remainder is split with
    ``word_tokenize``. Tokens present in NLTK's English or Russian stop-word
    lists are dropped.

    Args:
        text: The raw string to process.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    text = text.lower()
    text = re.sub(r'[^a-zа-яё\s]', '', text)
    # Look the stop words up once per call; membership tests against the
    # frozenset replace two list rebuilds and linear scans per token.
    blocked = _stopword_set()
    return [token for token in word_tokenize(text) if token not in blocked]

# Pool the cleaned tokens of every non-missing lyric into one flat list.
all_words = [
    token
    for lyric_text in df['lyrics'].dropna()
    for token in clean_text(lyric_text)
]

# Count occurrences and keep the 20 most frequent words.
counter = Counter(all_words)
top_words = counter.most_common(20)

print("Топ-20 слов:")
for entry in top_words:
    # Each entry is a (word, count) pair; printed as "word: count".
    print(entry[0], entry[1], sep=": ")


# Horizontal bar chart of the most frequent words.
# zip(*top_words) yields nothing for an empty list, so unpacking it into two
# names would raise ValueError; skip the plot when there are no words.
if top_words:
    words, counts = zip(*top_words)
    plt.figure(figsize=(10,6))
    sns.barplot(x=list(counts), y=list(words), palette="viridis")
    plt.title("Топ-20 часто встречающихся слов")
    plt.xlabel("Количество вхождений")
    plt.ylabel("Слова")
    plt.show()
else:
    print("Нет слов для построения графика.")