In [None]:
import pandas as pd
# Load the raw comment export into `data`; the last expression previews the first five rows.
data = pd.read_csv(filepath_or_buffer='2024“中促杯”全国大学生数据分析大赛例题数据.csv')
data.head(n=5)

In [None]:
import re
# Normalise the raw text columns of `data` in place.
#   citys   -> first run of characters that contains no '"', '\', '[' or ']'
#   scores  -> first run of digits, converted to int
#   content -> every run of characters outside U+4E00..U+9FD5 collapsed to ','
# Patterns are compiled once instead of being re-evaluated twice per row.
_CITY_TOKEN = re.compile(r'[^"\\[\]]+')
_DIGITS = re.compile(r'[0-9]+')
_NON_CHINESE = re.compile(r'[^\u4E00-\u9FD5]+')


def _first_match(pattern, value, cast=None):
    """Return the first match of ``pattern`` in ``str(value)``, or None.

    Missing values (None / NaN) yield None up front; otherwise ``str(nan)``
    would be matched as the literal text 'nan'. ``cast`` (e.g. ``int``) is
    applied to the matched text when given.
    """
    if pd.isna(value):
        return None
    found = pattern.search(str(value))
    if found is None:
        return None
    text = found.group(0)
    return cast(text) if cast is not None else text


def _chinese_only(value):
    """Replace each run of non-Chinese characters in ``value`` with ','.

    Missing values become an empty string so that later tokenisation of the
    column always receives a ``str``.
    """
    if pd.isna(value):
        return ''
    return _NON_CHINESE.sub(',', str(value))


data['citys'] = data['citys'].apply(lambda value: _first_match(_CITY_TOKEN, value))
data['scores'] = data['scores'].apply(lambda value: _first_match(_DIGITS, value, int))
data['content'] = data['content'].apply(_chinese_only)
data

In [None]:
import matplotlib.pyplot as plt
# Number of comments for each distinct rating value.
score_counts = data['scores'].value_counts()

plt.figure(figsize=(4, 4))
# SimHei is selected so that the Chinese title can be rendered.
plt.rcParams['font.sans-serif'] = 'Simhei'
# One wedge per rating, annotated with its percentage to two decimals.
plt.pie(score_counts, labels=score_counts.index, autopct="%.2f%%")
plt.title('《流浪地球》豆瓣评分分布表')
plt.show()

In [None]:
# Comments per day: the day is the first whitespace-separated piece of `times`
# (assumes `times` holds strings such as "<date> <time>" -- TODO confirm).
daily_counts = (
    data['times']
    .map(lambda stamp: stamp.split()[0])
    .value_counts()
    .sort_index()
)
positions = range(len(daily_counts))

plt.figure(figsize=(8, 5))
plt.plot(positions, daily_counts)
# Label each position with its date, tilted for readability.
plt.xticks(positions, daily_counts.index, rotation=45)
plt.title('评论数量随日期的变化图')
plt.grid()
plt.show()

In [None]:
# Comments per hour of day, with hours taken from the parsed `times` column.
comment_hours = pd.to_datetime(data['times']).map(lambda moment: moment.hour)
hourly_counts = comment_hours.value_counts().sort_index()
hours = hourly_counts.index

plt.plot(hours, hourly_counts)
plt.grid()
# One tick per hour that actually occurs in the data.
plt.xticks(hours, hours)
plt.title('评论数量随时刻的变化图')
plt.show()

In [None]:
# Daily comment counts per rating, drawn as a diverging area chart:
# ratings of 30 and above are plotted upwards, lower ratings downwards.
data['date'] = data['times'].apply(lambda x: x.split()[0])

# Comments without a score are excluded explicitly rather than by assuming
# that the missing-score bucket happens to be the last column.
rated = data.dropna(subset=['scores'])

# date x score contingency table; reindex so that days on which every comment
# is unrated still appear on the x axis (with zero counts).
tmp = (
    pd.crosstab(rated['date'], rated['scores'])
    .reindex(sorted(data['date'].unique()), fill_value=0)
)

positive_from = 30  # same good/bad boundary as the later `scores >= 30` split
n = len(tmp)
day_positions = range(n)

plt.figure(figsize=(8, 4))
# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus.
plt.rcParams['axes.unicode_minus'] = False
for score in tmp.columns:
    # The sign depends on the score value, not on the column position, so a
    # missing score level cannot flip the direction of the other series.
    sign = 1 if score >= positive_from else -1
    series = sign * tmp[score]
    plt.plot(day_positions, series, label=str(score))
    plt.fill_between(day_positions, series, alpha=0.5)
plt.legend()
plt.xticks(day_positions, tmp.index, rotation=45)
plt.title('评论评分随日期的变化图')
plt.grid()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# data['citys'] is treated as a Series of city names; value_counts() orders
# them from most to fewest comments, so the first ten rows are the top ten.
# (The slice is positional, assuming the index holds city-name strings.)
top_cities = data['citys'].value_counts().iloc[:10]
slots = range(len(top_cities))

# Select a font that supports Chinese characters.
plt.rcParams['font.sans-serif'] = 'Simhei'

# Bar chart of the ten cities with the most comments.
plt.bar(slots, top_cities.values)

# Use the city names as x tick labels, rotated by 45 degrees.
plt.xticks(slots, top_cities.index, rotation=45)

# Chart title.
plt.title('评论数量最多的前十个城市')

# Grid lines.
plt.grid()

# Render the figure.
plt.show()

In [None]:
import jieba
import itertools
from wordcloud import WordCloud

# Read the stop-word file as whitespace-separated words and prepend a few
# extra entries ('' and '\n' plus the word '这部').
with open('cn_stopwords.txt', 'r', encoding='utf-8') as f:
    stop = f.read()
stop = stop.split()
stop = ['', '\n', '这部'] + stop
# Hashed copy for filtering: testing each token against the list would scan
# the whole stop-word list once per token.
stop_set = frozenset(stop)

# Tokenise every comment with jieba (one list of tokens per comment).
data_cut = data['content'].apply(jieba.lcut)

# Drop stop words from each token list.
data_after = data_cut.apply(
    lambda tokens: [token for token in tokens if token not in stop_set]
)

print(data_cut.head())
print(data_after.head())

In [None]:
# Frequency of every remaining token across all comments.
all_tokens = list(itertools.chain.from_iterable(data_after))
word_freq = pd.Series(all_tokens).value_counts()

# The image is passed to WordCloud as its mask; the font file is needed so
# that Chinese words can be drawn.
mask_image = plt.imread('111.jpg')
cloud = WordCloud(
    mask=mask_image,
    background_color='White',
    font_path='./data/simhei.ttf',
)
rendered = cloud.fit_words(word_freq)

plt.imshow(rendered)
plt.axis('off')
plt.show()

In [None]:
# Split the filtered token lists by rating: 30 and above counts as good,
# below 30 as bad. Rows with a missing score match neither comparison.
data_good = data_after[data['scores'].ge(30)]
data_bad = data_after[data['scores'].lt(30)]

def my_wc(data_after, mask_path='111.jpg', font_path='./data/simhei.ttf'):
    """Draw a word cloud from token frequencies and show it with matplotlib.

    Parameters
    ----------
    data_after : iterable of iterables of str
        One token sequence per comment (for example a Series of lists).
    mask_path : str, optional
        Image file read with ``plt.imread`` and passed to ``WordCloud`` as
        its ``mask``. Defaults to the path that was previously hard-coded.
    font_path : str, optional
        Font file handed to ``WordCloud``. Defaults to the path that was
        previously hard-coded.

    Raises
    ------
    ValueError
        If ``data_after`` contains no tokens at all; raised before any file
        is read or any figure is drawn.
    """
    tokens = list(itertools.chain.from_iterable(data_after))
    if not tokens:
        # Fail early with a clear message instead of after loading the mask.
        raise ValueError('my_wc needs at least one token to draw a word cloud')
    freq = pd.Series(tokens).value_counts()
    mask = plt.imread(mask_path)
    cloud = WordCloud(background_color='White', font_path=font_path, mask=mask)
    plt.imshow(cloud.fit_words(freq))
    plt.axis('off')
    plt.show()

# Draw one word cloud per group: low-rated comments first, then high-rated.
for review_tokens in (data_bad, data_good):
    my_wc(review_tokens)

In [None]:
# data_good and data_bad are Series holding one list of tokens per comment.
# Tokens are joined with a single space: joining them with '' would glue each
# comment back into one unbroken string, which a word-level vectorizer then
# treats as a single token, discarding the segmentation.
data_new_good = pd.DataFrame({'text': data_good.apply(' '.join), 'label': 1})
data_new_good = data_new_good.reset_index(drop=True)

data_new_bad = pd.DataFrame({'text': data_bad.apply(' '.join), 'label': 0})
data_new_bad = data_new_bad.reset_index(drop=True)

# ignore_index gives the combined frame unique row labels (both parts were
# reset to start at 0).
data_new = pd.concat([data_new_bad, data_new_good], axis=0, ignore_index=True)

from sklearn.model_selection import train_test_split
test_ratio = 0.2
# Stratified split on the label with a fixed seed so the split is repeatable.
src_training, src_testing = train_test_split(
    data_new,
    test_size=test_ratio,
    stratify=data_new['label'],
    random_state=123,
)

# Plain arrays of texts and labels for the feature-extraction step.
comments_train, comments_test = src_training['text'].values, src_testing['text'].values
y_train, y_test = src_training['label'].values, src_testing['label'].values

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Term counting followed by TF-IDF re-weighting.
count_vectorizer, tfidf_transformer = CountVectorizer(), TfidfTransformer()

# The vocabulary is learned from the training comments only; the test
# comments are mapped onto that same vocabulary.
word_count_train = count_vectorizer.fit_transform(comments_train)
word_count_test = count_vectorizer.transform(comments_test)

# Likewise the TF-IDF transformer is fitted on the training counts and then
# applied to the test counts.
tfidf_train = tfidf_transformer.fit_transform(word_count_train)
tfidf_test = tfidf_transformer.transform(word_count_test)

# Print the shapes of the two TF-IDF matrices.
print(*(matrix.shape for matrix in (tfidf_train, tfidf_test)))


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# k-nearest-neighbours classifier (k=8, distance weighting) trained on the
# TF-IDF features; fit() returns the estimator, so prediction is chained.
knn = KNeighborsClassifier(weights='distance', n_neighbors=8)
tfidf_y_pred = knn.fit(tfidf_train, y_train).predict(tfidf_test)

# Accuracy on the held-out test comments.
test_accuracy = accuracy_score(y_test, tfidf_y_pred)
print('tfidf_KNeighborsClassifier test accuracy %s' % test_accuracy)