In [None]:
# 洗数据以提供给机器学习

import pandas as pd

# 读取CSV文件
df = pd.read_csv('log_2020_01.csv', nrows=1000000)

# 取前五百行
df = df.head(1000000)

# 删去不需要的列（假设你知道列名）
columns_to_keep = ['id', 'actor_id','actor_login',  'issue_author_type','repo_id', 'repo_name', 'created_at', 'create_description', 'pull_merged_by_type']  # 替换为你需要保留的列名
df = df[columns_to_keep]

# 保存为新的CSV文件
df.to_csv('Washed2.csv', index=False)

In [None]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt

# 读取CSV文件
file_path = 'Washed2.csv'
df = pd.read_csv(file_path, encoding='ISO-8859-1')

# 确保数据没有缺失值
df = df.dropna(subset=['created_at', 'repo_id', 'repo_name'])

# 提取created_at中的日期部分并统计频率，生成CSV表格
df['created_date'] = pd.to_datetime(df['created_at']).dt.date
date_counts = df['created_date'].value_counts().reset_index()
date_counts.columns = ['created_date', 'count']
date_counts = date_counts.sort_values('created_date')

# 保存统计频率的CSV表格
date_counts.to_csv('created_date_counts.csv', index=False)
print("created_date_counts.csv")

# 设置时间序列数据
date_counts.set_index('created_date', inplace=True)

# 拆分训练集和测试集
train_data = date_counts.iloc[:-30]
test_data = date_counts.iloc[-30:]

# 确保时间序列的稳定性
train_data_diff = train_data.diff().dropna()

# 定义并训练ARIMA模型
# 调整ARIMA模型的阶数
model = ARIMA(train_data_diff, order=(2, 1, 0))  # 降低模型阶数
model_fit = model.fit()

# 预测
forecast_diff = model_fit.forecast(steps=30)
forecast = train_data['count'].iloc[-1] + forecast_diff.cumsum()

# 创建预测结果的DataFrame
forecast_index = pd.date_range(start=test_data.index[-1] + pd.Timedelta(days=1), periods=30, freq='D')
forecast_df = pd.DataFrame(forecast.values, index=forecast_index, columns=['forecast'])
forecast_df = forecast_df.reset_index()
forecast_df.columns = ['created_date', 'forecast']

# 保存预测结果到CSV文件
forecast_df.to_csv('forecasted_created_date_counts.csv', index=False)
print("forecasted_created_date_counts.csv")

# 绘制结果
plt.figure(figsize=(12, 6))
plt.plot(train_data.index, train_data['count'], label='Training Data')
plt.plot(test_data.index, test_data['count'], label='Test Data')
plt.plot(forecast_df['created_date'], forecast_df['forecast'], label='Forecast')
plt.xlabel('Date')
plt.ylabel('Commit Count')
plt.title('Commit Count Forecast')
plt.legend()
plt.show()


In [None]:
# 重新进行大规模统计（100 0000行）

import pandas as pd

# 读取CSV文件
df = pd.read_csv('log_2020_01.csv' ,nrows = 1000000)


df = df.head(1000000)

# 删去不需要的列（假设你知道列名）


# 保存为新的CSV文件
df.to_csv('Washed4.csv', index=False)

  df = pd.read_csv('log_2020_01.csv' ,nrows = 1000000)


In [2]:
import pandas as pd

# 读取CSV文件
df = pd.read_csv('Washed4.csv')

# 剔除create_description为空的行
df_cleaned = df.dropna(subset=['create_description'])

# 按repo_id分组，统计每个repo_id的行数
repo_count = df_cleaned.groupby('repo_id').size().reset_index(name='count')

# 将结果保存为新的CSV文件
repo_count.to_csv('output.csv', index=False)

print(repo_count)


  df = pd.read_csv('Washed4.csv')
  range_count = repo_count.groupby('count_range').size().reset_index(name='repo_count')


In [1]:
import pandas as pd

# 读取CSV文件
df = pd.read_csv('Washed4.csv')

# 剔除create_description为空的行
df_cleaned = df.dropna(subset=['create_description'])

# 按repo_id分组，统计每个repo_id的行数
repo_count = df_cleaned.groupby('repo_id').size().reset_index(name='count')

# 定义count的区间
bins = [0, 1, 5, 10, 20, 50, 100, 1000]  # 自定义区间
labels = ['1 or less', '2-5', '6-10', '11-20', '21-50', '51-100', '>100']  # 给每个区间一个标签

# 将count分类
repo_count['count_range'] = pd.cut(repo_count['count'], bins=bins, labels=labels, right=False)

# 对count_range进行分组统计每个区间的repo_id数量
range_count = repo_count.groupby('count_range').size().reset_index(name='repo_count')

# 将结果保存为新的CSV文件
range_count.to_csv('output_range_count.csv', index=False)

  df = pd.read_csv('Washed4.csv')
  range_count = repo_count.groupby('count_range').size().reset_index(name='repo_count')


In [5]:
import pandas as pd

# 读取CSV文件
file_path = '/mnt/data/Washed2.csv'  # 请根据实际路径调整
df = pd.read_csv(file_path)

# 步骤1：将 issue_comments 为0的值替换为 NaN
df['issue_comments'] = df['issue_comments'].replace(0, pd.NA)

# 步骤2：剔除 issue_comments 为 NaN 的行
df_filtered = df.dropna(subset=['issue_comments'])

# 步骤3：根据 repo_id 分组，计算每组中 issue_comments 的最大值
max_issue_comments = df_filtered.groupby('repo_id')['issue_comments'].max().reset_index()

# 步骤4：定义分类区间
bins = [0, 10, 50, 100, 500, 1000, float('inf')]  # 定义区间范围
labels = ['0-10', '11-50', '51-100', '101-500', '501-1000', '1000+']  # 区间标签

# 步骤5：根据 issue_comments 对仓库进行分类
max_issue_comments['issue_comments_range'] = pd.cut(max_issue_comments['issue_comments'], bins=bins, labels=labels, right=False)

# 步骤6：统计每个区间内的仓库数量
range_counts = max_issue_comments['issue_comments_range'].value_counts().reset_index()
range_counts.columns = ['issue_comments_range', 'repo_count']

# 步骤7：保存统计结果为新的CSV文件
range_counts.to_csv('create_description_range_counts.csv', index=False)

print("Issue comments range counts saved to create_description_range_counts.csv")



Issue comments range counts saved to issue_comments_range_counts.csv


In [None]:


import pandas as pd

# 读取CSV文件
df = pd.read_csv('log_2020_01.csv' ,nrows = 1000000)


df = df.head(1000000)

# 删去不需要的列（假设你知道列名）
columns_to_keep = ['id','type', 'actor_id','actor_login',  'issue_author_type','repo_id', 'repo_name', 'created_at', 'create_description', 'pull_merged_by_type', 'repo_language']  # 替换为你需要保留的列名
df = df[columns_to_keep]

# 保存为新的CSV文件
df.to_csv('Washed1.csv', index=False)

In [None]:
import pandas as pd

# 读取CSV文件
file_path = 'Washed1.csv'
df = pd.read_csv(file_path)

# 统计repo_id的出现次数并生成CSV表格
repo_id_counts = df['repo_id'].value_counts().reset_index()
repo_id_counts.columns = ['repo_id', 'count']
result = pd.merge(repo_id_counts, df[['repo_id', 'repo_name']].drop_duplicates(), on='repo_id')
result.to_csv('repo_id_counts.csv', index=False)
print("Repo ID counts saved to repo_id_counts.csv")

# 提取created_at中的日期部分并统计频率，生成CSV表格
df['created_date'] = pd.to_datetime(df['created_at']).dt.date
date_counts = df['created_date'].value_counts().reset_index()
date_counts.columns = ['created_date', 'count']
date_counts.to_csv('created_date_counts.csv', index=False)
print("Created date counts saved to created_date_counts.csv")


In [None]:
import pandas as pd

# 读取CSV文件
file_path = 'Washed1.csv' 
df = pd.read_csv(file_path)

# 统计repo_language的出现次数
language_counts = df['repo_language'].value_counts().reset_index()
language_counts.columns = ['repo_language', 'count']

# 保存为新的CSV文件
language_counts.to_csv('repo_language_counts.csv', index=False)

print("Repo language counts saved to repo_language_counts.csv")

In [None]:
# 重新进行大规模统计（100 0000行）

import pandas as pd

# 读取CSV文件
df = pd.read_csv('log_2020_01.csv' ,nrows = 1000000)


df = df.head(1000000)

# 删去不需要的列（假设你知道列名）
columns_to_keep = ['id','type', 'actor_id','actor_login',  'issue_author_type','repo_id', 'repo_name', 'created_at', 'create_description', 'pull_merged_by_type', 'repo_language']  # 替换为你需要保留的列名
df = df[columns_to_keep]

# 保存为新的CSV文件
df.to_csv('Washed1.csv', index=False)

In [None]:
import pandas as pd

# 读取CSV文件
file_path = 'Washed1.csv'
df = pd.read_csv(file_path)

# 统计repo_id的出现次数并生成CSV表格
repo_id_counts = df['repo_id'].value_counts().reset_index()
repo_id_counts.columns = ['repo_id', 'count']
result = pd.merge(repo_id_counts, df[['repo_id', 'repo_name']].drop_duplicates(), on='repo_id')
result.to_csv('repo_id_counts.csv', index=False)
print("Repo ID counts saved to repo_id_counts.csv")

# 提取created_at中的日期部分并统计频率，生成CSV表格
df['created_date'] = pd.to_datetime(df['created_at']).dt.date
date_counts = df['created_date'].value_counts().reset_index()
date_counts.columns = ['created_date', 'count']
date_counts.to_csv('created_date_counts.csv', index=False)
print("Created date counts saved to created_date_counts.csv")


In [None]:
import pandas as pd

# 读取CSV文件
file_path = 'Washed1.csv' 
df = pd.read_csv(file_path)

# 统计repo_language的出现次数
language_counts = df['repo_language'].value_counts().reset_index()
language_counts.columns = ['repo_language', 'count']

# 保存为新的CSV文件
language_counts.to_csv('repo_language_counts.csv', index=False)

print("Repo language counts saved to repo_language_counts.csv")